1092 lines
38 KiB
Python
1092 lines
38 KiB
Python
"""
|
||
字节直播数据采集工具 - GUI 版本
|
||
整合 commerce 和 entertainment 两个爬虫
|
||
"""
|
||
import sys
|
||
import os
|
||
import threading
|
||
import queue
|
||
import json
|
||
import hashlib
|
||
import requests
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
|
||
# Resolve the base directories for both a packaged (frozen) build and a
# plain source checkout.
if not getattr(sys, 'frozen', False):
    # Development mode: this file's directory is the GUI directory and the
    # crawler packages live one level above it.
    GUI_DIR = Path(__file__).parent
    CRAWLER_DIR = GUI_DIR.parent
else:
    # Packaged mode: everything is located under sys._MEIPASS.
    BASE_DIR = Path(sys._MEIPASS)
    GUI_DIR = CRAWLER_DIR = BASE_DIR

    # Point Playwright at the bundled browsers, if they were shipped.
    playwright_path = BASE_DIR / 'ms-playwright'
    if playwright_path.exists():
        os.environ['PLAYWRIGHT_BROWSERS_PATH'] = str(playwright_path)

# Make the project root importable.
sys.path.insert(0, str(CRAWLER_DIR))
|
||
|
||
import customtkinter as ctk
|
||
from tkinter import messagebox
|
||
import importlib
|
||
|
||
|
||
def import_with_crawler_context(crawler_type: str, module_path: str):
    """Import ``module_path`` with one crawler's directory first on ``sys.path``.

    The commerce and entertainment crawlers both contain top-level packages
    with the same names (config/core/utils), so every cached module under
    those names is evicted from ``sys.modules`` before the import runs.

    NOTE(review): this mutates ``sys.modules`` and ``sys.path`` without any
    locking, and callers in this file invoke it from worker threads --
    confirm that two crawlers are never imported concurrently.

    Args:
        crawler_type: 'commerce' or 'entertainment'.
        module_path: Dotted path of the module to import,
            e.g. 'core.browser_login'.

    Returns:
        The imported module.
    """
    crawler_dir = str(CRAWLER_DIR / crawler_type)

    shared_names = ('config', 'core', 'utils', 'services')
    dotted_prefixes = tuple(f"{name}." for name in shared_names)

    # Collect first, delete afterwards: sys.modules must not change size
    # while it is being iterated.
    stale = [
        cached for cached in sys.modules
        if cached in shared_names or cached.startswith(dotted_prefixes)
    ]
    for cached in stale:
        del sys.modules[cached]

    # Move (or add) the crawler directory to the very front of sys.path.
    try:
        sys.path.remove(crawler_dir)
    except ValueError:
        pass
    sys.path.insert(0, crawler_dir)

    # The path entry is deliberately left in place: the imported module may
    # still need it for imports it performs later at run time.
    return importlib.import_module(module_path)
|
||
|
||
|
||
# Global look and feel: dark mode with the blue colour theme.
ctk.set_appearance_mode("dark")
ctk.set_default_color_theme("blue")
|
||
|
||
|
||
# Platform configuration (kept in sync with the browser extension).
# Keys are the crawler types; each entry holds a display name and the login
# URL that is reported together with an uploaded cookie.
PLATFORMS = dict(
    commerce=dict(
        name='巨量百应-达人账号',
        loginUrl='https://buyin.jinritemai.com/mpa/account/institution-role-select',
    ),
    entertainment=dict(
        name='字节联盟-主播排行',
        loginUrl='https://union.bytedance.com/open/portal/data/leaderboard?appId=3000',
    ),
)
|
||
|
||
|
||
class GlobalConfigManager:
    """Global configuration manager for the cookie-upload server settings.

    The settings are stored as JSON in ``upload_config.json`` next to the
    executable (packaged build) or inside the GUI directory (development).
    """

    # NOTE(review): an API key is hard-coded here as the default value;
    # consider loading it from the config file or the environment instead.
    DEFAULT_CONFIG = {
        'serverUrl': 'https://admin.api.skzhijia.com',
        'apiKey': 'sk_7A353DEF2BFD4EADA4BF364CCC7C8FDE2AAD1'
    }

    def __init__(self):
        """Work out where the configuration file lives."""
        if getattr(sys, 'frozen', False):
            self.config_file = Path(sys.executable).parent / 'upload_config.json'
        else:
            self.config_file = GUI_DIR / 'upload_config.json'

    def load_config(self) -> dict:
        """Load the stored configuration merged over the defaults.

        Returns:
            A new dict. Stored values override ``DEFAULT_CONFIG``; a copy of
            the defaults is returned when the file is missing, unreadable,
            not valid JSON, or does not contain a JSON object.
        """
        try:
            with open(self.config_file, 'r', encoding='utf-8') as f:
                stored = json.load(f)
        except (OSError, ValueError, RecursionError):
            # Missing/unreadable file or malformed JSON: fall back to defaults.
            return self.DEFAULT_CONFIG.copy()

        if not isinstance(stored, dict):
            # Valid JSON that is not an object cannot be merged.
            return self.DEFAULT_CONFIG.copy()
        return {**self.DEFAULT_CONFIG, **stored}

    def save_config(self, config: dict) -> bool:
        """Write ``config`` to the configuration file as JSON.

        The data is serialized *before* the file is opened for writing, so a
        value that cannot be serialized no longer truncates the existing
        file.

        Args:
            config: The configuration mapping to persist.

        Returns:
            True on success, False when serialization or writing failed.
        """
        try:
            serialized = json.dumps(config, ensure_ascii=False, indent=2)
            with open(self.config_file, 'w', encoding='utf-8') as f:
                f.write(serialized)
        except (OSError, TypeError, ValueError):
            return False
        return True
|
||
|
||
|
||
# Shared, module-wide configuration manager instance.
global_config_manager = GlobalConfigManager()
|
||
|
||
|
||
class ConfigDialog(ctk.CTkToplevel):
    """Modal settings dialog for one crawler.

    Shows the global cookie-upload settings (server URL and API key) and the
    crawler's own settings read from its ``config.settings`` module.

    NOTE(review): only the global upload settings are written back by
    ``on_save``; the crawler entries collected in ``self.entries`` are
    displayed but never persisted by this class.
    """

    def __init__(self, parent, crawler_type, on_save_callback=None):
        """Build the dialog.

        Args:
            parent: Window the dialog is transient for.
            crawler_type: 'commerce' or 'entertainment'.
            on_save_callback: Optional zero-argument callable invoked after
                a successful save.
        """
        super().__init__(parent)
        self.crawler_type = crawler_type
        self.on_save_callback = on_save_callback

        self.title(f"配置 - {crawler_type}")
        self.geometry("500x480")
        self.resizable(False, False)

        # Make the dialog modal.
        self.transient(parent)
        self.grab_set()

        self.load_config()
        self.setup_ui()

        # Centre the dialog on the screen.
        self.update_idletasks()
        x = (self.winfo_screenwidth() - self.winfo_width()) // 2
        y = (self.winfo_screenheight() - self.winfo_height()) // 2
        self.geometry(f"+{x}+{y}")

    def load_config(self):
        """Read the crawler settings and the global upload settings.

        Populates ``self.config_values`` (left empty, after showing an error
        box, when the crawler's settings module cannot be loaded) and
        ``self.global_config``.
        """
        try:
            config_module = import_with_crawler_context(self.crawler_type, 'config.settings')
            self.config_values = {
                'REPORT_URL': getattr(config_module, 'REPORT_URL', ''),
                'REQUEST_TIMEOUT': getattr(config_module, 'REQUEST_TIMEOUT', 30),
                'BROWSER_TIMEOUT': getattr(config_module, 'BROWSER_TIMEOUT', 300),
                'FANS_FETCH_CONCURRENCY': getattr(config_module, 'FANS_FETCH_CONCURRENCY', 5),
            }
            if self.crawler_type == 'commerce':
                self.config_values['DAEMON_INTERVAL_HOURS'] = getattr(config_module, 'DAEMON_INTERVAL_HOURS', 1)
            else:
                self.config_values['DAEMON_INTERVAL_NO_FANS_HOURS'] = getattr(config_module, 'DAEMON_INTERVAL_NO_FANS_HOURS', 1)
                self.config_values['DAEMON_INTERVAL_WITH_FANS_HOURS'] = getattr(config_module, 'DAEMON_INTERVAL_WITH_FANS_HOURS', 6)
        except Exception as e:
            self.config_values = {}
            messagebox.showerror("错误", f"加载配置失败: {e}")

        # Global upload settings.
        self.global_config = global_config_manager.load_config()

    def setup_ui(self):
        """Create the scrollable form and the save/cancel buttons."""
        scroll_frame = ctk.CTkScrollableFrame(self)
        scroll_frame.pack(fill="both", expand=True, padx=10, pady=10)

        self.entries = {}
        self.global_entries = {}

        # Cookie upload settings (global).
        ctk.CTkLabel(scroll_frame, text="Cookie 上传配置", font=ctk.CTkFont(weight="bold")).pack(anchor="w", pady=(0, 5))
        self._add_global_entry(scroll_frame, "服务器地址:", 'serverUrl', width=400)
        self._add_global_entry(scroll_frame, "API Key:", 'apiKey', width=400, show="*")

        # Settings common to both crawlers.
        ctk.CTkLabel(scroll_frame, text="爬虫配置", font=ctk.CTkFont(weight="bold")).pack(anchor="w", pady=(15, 5))
        self._add_entry(scroll_frame, "数据上报URL:", 'REPORT_URL', width=400)
        self._add_entry(scroll_frame, "请求超时(秒):", 'REQUEST_TIMEOUT', width=100)
        self._add_entry(scroll_frame, "登录超时(秒):", 'BROWSER_TIMEOUT', width=100)
        self._add_entry(scroll_frame, "粉丝抓取并发数:", 'FANS_FETCH_CONCURRENCY', width=100)

        # Daemon interval settings; these differ per crawler.
        ctk.CTkLabel(scroll_frame, text="守护进程配置", font=ctk.CTkFont(weight="bold")).pack(anchor="w", pady=(15, 5))
        if self.crawler_type == 'commerce':
            self._add_entry(scroll_frame, "抓取间隔(小时):", 'DAEMON_INTERVAL_HOURS', width=100)
        else:
            self._add_entry(scroll_frame, "无粉丝抓取间隔(小时):", 'DAEMON_INTERVAL_NO_FANS_HOURS', width=100)
            self._add_entry(scroll_frame, "含粉丝抓取间隔(小时):", 'DAEMON_INTERVAL_WITH_FANS_HOURS', width=100)

        # Button row.
        btn_frame = ctk.CTkFrame(self)
        btn_frame.pack(fill="x", padx=10, pady=10)
        ctk.CTkButton(btn_frame, text="保存", width=80, command=self.on_save).pack(side="right", padx=5)
        ctk.CTkButton(btn_frame, text="取消", width=80, command=self.destroy).pack(side="right", padx=5)

    def _make_entry_row(self, parent, label, value, **entry_options):
        """Create one "label + entry" row pre-filled with ``value``.

        Only ``None`` is shown as an empty field, so legitimate falsy values
        such as ``0`` remain visible.

        Returns:
            The created ``CTkEntry``.
        """
        frame = ctk.CTkFrame(parent, fg_color="transparent")
        frame.pack(fill="x", pady=3)

        ctk.CTkLabel(frame, text=label, width=150, anchor="w").pack(side="left")
        entry = ctk.CTkEntry(frame, **entry_options)
        entry.pack(side="left", padx=5)

        entry.insert(0, '' if value is None else str(value))
        return entry

    def _add_entry(self, parent, label, key, width=200):
        """Add a crawler setting row and register it in ``self.entries``."""
        value = self.config_values.get(key, '')
        self.entries[key] = self._make_entry_row(parent, label, value, width=width)

    def _add_global_entry(self, parent, label, key, width=200, show=None):
        """Add a global setting row and register it in ``self.global_entries``.

        Args:
            show: Forwarded to ``CTkEntry`` (the API key row passes "*").
        """
        value = self.global_config.get(key, '')
        self.global_entries[key] = self._make_entry_row(parent, label, value, width=width, show=show)

    def on_save(self):
        """Persist the global upload settings and close the dialog.

        The form values are layered over the configuration that was loaded
        when the dialog opened, so keys that have no form field are kept
        instead of being dropped from the file.
        """
        new_global_config = dict(self.global_config)
        for key, entry in self.global_entries.items():
            new_global_config[key] = entry.get().strip()

        if not global_config_manager.save_config(new_global_config):
            messagebox.showerror("错误", "保存配置失败")
            return
        messagebox.showinfo("提示", "配置已保存。\n注意:部分爬虫配置需要重启程序才能生效。")

        if self.on_save_callback:
            self.on_save_callback()

        self.destroy()
|
||
|
||
|
||
class LogHandler:
    """Stream-like log sink that forwards written text to the GUI log queue."""

    def __init__(self, text_widget, log_queue):
        """Remember the target widget and the queue messages are pushed to."""
        self.log_queue = log_queue
        self.text_widget = text_widget

    def write(self, message):
        """Queue ``message`` unless it is empty or whitespace-only."""
        if not message.strip():
            return
        self.log_queue.put(message)

    def flush(self):
        """Do nothing; provided so the object can stand in for a stream."""
|
||
|
||
|
||
class CrawlerTab(ctk.CTkFrame):
    """Base class for a crawler tab.

    Provides the cookie status label, the action buttons (login, logout,
    cookie upload, status, crawl, daemon, stop, config) and the shared
    running/stopped state handling. Subclasses override ``setup_params``,
    ``on_crawl`` and ``on_daemon``.
    """

    # Attribute names of every action button, used to toggle them in bulk.
    _ALL_BUTTONS = (
        "login_btn", "logout_btn", "upload_btn", "status_btn",
        "crawl_btn", "daemon_btn", "stop_btn", "config_btn",
    )

    def __init__(self, parent, crawler_type, log_queue):
        """Create the tab.

        Args:
            parent: Parent widget.
            crawler_type: 'commerce' or 'entertainment'.
            log_queue: Queue that receives formatted log lines.
        """
        super().__init__(parent)
        self.crawler_type = crawler_type
        self.log_queue = log_queue
        self.running_thread = None
        self.stop_flag = threading.Event()
        self.is_logged_in = False
        self.is_running = False  # whether a background task is running

        self.setup_ui()
        self.check_cookie_status()

    def setup_ui(self):
        """Build the status label, the button row and the parameter area."""
        # Cookie status area.
        status_frame = ctk.CTkFrame(self)
        status_frame.pack(fill="x", padx=10, pady=10)

        self.cookie_label = ctk.CTkLabel(
            status_frame,
            text="Cookie 状态: 检查中...",
            font=ctk.CTkFont(size=14)
        )
        self.cookie_label.pack(side="left", padx=10, pady=5)

        # Button area.
        btn_frame = ctk.CTkFrame(self)
        btn_frame.pack(fill="x", padx=10, pady=5)

        def add_button(text, command, width=80, **options):
            """Create a button in the button row and pack it to the left."""
            button = ctk.CTkButton(btn_frame, text=text, width=width, command=command, **options)
            button.pack(side="left", padx=5, pady=5)
            return button

        self.login_btn = add_button("登录", self.on_login)
        self.logout_btn = add_button(
            "退出登录", self.on_logout,
            fg_color="orange", hover_color="darkorange", state="disabled",
        )
        self.upload_btn = add_button(
            "上传Cookie", self.on_upload_cookie, width=90,
            fg_color="#28a745", hover_color="#218838", state="disabled",
        )
        self.status_btn = add_button("查看状态", self.on_status, state="disabled")
        self.crawl_btn = add_button("单次抓取", self.on_crawl, state="disabled")
        self.daemon_btn = add_button("启动守护", self.on_daemon, state="disabled")
        self.stop_btn = add_button(
            "停止", self.on_stop,
            fg_color="red", hover_color="darkred", state="disabled",
        )
        self.config_btn = add_button(
            "配置", self.on_config,
            fg_color="gray", hover_color="darkgray",
        )

        # Parameter area, filled in by subclasses.
        params_frame = ctk.CTkFrame(self)
        params_frame.pack(fill="x", padx=10, pady=5)
        self.setup_params(params_frame)

    def setup_params(self, parent):
        """Hook for subclasses to add their parameter widgets to ``parent``."""
        pass

    def log(self, message):
        """Queue ``message`` prefixed with a timestamp and the crawler type."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        self.log_queue.put(f"[{timestamp}] [{self.crawler_type}] {message}\n")

    def check_cookie_status(self):
        """Refresh the status label and ``is_logged_in`` from the saved cookie.

        The tab counts as logged in when the crawler's ``CookieManager``
        reports an existing cookie file and loading it returns data. Any
        error is shown in the label and treated as "not logged in".
        """
        try:
            cookie_module = import_with_crawler_context(self.crawler_type, 'core.cookie_manager')
            cookie_manager = cookie_module.CookieManager()

            cookie_data = cookie_manager.load_cookies() if cookie_manager.is_cookie_exists() else None
            if cookie_data:
                username = cookie_data.get("metadata", {}).get("username", "未知")
                text, color, logged_in = f"Cookie 状态: 已登录 ({username})", "green", True
            else:
                text, color, logged_in = "Cookie 状态: 未登录", "red", False
        except Exception as e:
            text, color, logged_in = f"Cookie 状态: 检查失败 ({e})", "orange", False

        self.cookie_label.configure(text=text, text_color=color)
        self.is_logged_in = logged_in
        self.update_buttons_state()

    def update_buttons_state(self):
        """Enable/disable the buttons according to running and login state."""
        if self.is_running:
            # A task is running: only "stop" is available.
            enabled = {"stop_btn"}
        elif self.is_logged_in:
            # Logged in and idle: everything except "login" and "stop".
            enabled = set(self._ALL_BUTTONS) - {"login_btn", "stop_btn"}
        else:
            # Not logged in: only "login" and "config".
            enabled = {"login_btn", "config_btn"}

        for name in self._ALL_BUTTONS:
            state = "normal" if name in enabled else "disabled"
            getattr(self, name).configure(state=state)

    def on_login(self):
        """Run the crawler's browser login in a background thread."""
        self.log("正在启动浏览器登录...")
        # Go through set_running_state so a stale stop flag is cleared, the
        # same way every other task start does.
        self.set_running_state(True)

        def run():
            try:
                browser_module = import_with_crawler_context(self.crawler_type, 'core.browser_login')
                browser_login = browser_module.BrowserLogin()
                cookies = browser_login.trigger_login()

                if cookies:
                    self.log("登录成功!")
                else:
                    self.log("登录失败或被取消")
            except Exception as e:
                self.log(f"登录出错: {e}")
            finally:
                # Hand the UI update over to the Tk event loop.
                self.after(0, self._on_login_complete)

        self.running_thread = threading.Thread(target=run, daemon=True)
        self.running_thread.start()

    def _on_login_complete(self):
        """Leave the running state and re-read the cookie status."""
        self.is_running = False
        self.check_cookie_status()

    def on_logout(self):
        """Delete the saved cookie file after user confirmation."""
        if not messagebox.askyesno("确认", "确定要退出登录吗?\n这将清除保存的Cookie。"):
            return

        try:
            cookie_module = import_with_crawler_context(self.crawler_type, 'core.cookie_manager')
            cookie_file = cookie_module.CookieManager().cookie_file

            if cookie_file.exists():
                cookie_file.unlink()
                self.log("已退出登录,Cookie 已清除")
            else:
                self.log("Cookie 文件不存在")

            self.check_cookie_status()
        except Exception as e:
            self.log(f"退出登录失败: {e}")

    def on_config(self):
        """Open the configuration dialog for this crawler."""
        ConfigDialog(self.winfo_toplevel(), self.crawler_type)

    def on_status(self):
        """Log details about the saved cookie, then refresh the status label."""
        try:
            cookie_module = import_with_crawler_context(self.crawler_type, 'core.cookie_manager')
            cookie_manager = cookie_module.CookieManager()

            if not cookie_manager.is_cookie_exists():
                self.log("Cookie 文件不存在,请先登录")
            else:
                cookie_data = cookie_manager.load_cookies()
                if not cookie_data:
                    self.log("Cookie 文件存在但加载失败")
                else:
                    metadata = cookie_data.get("metadata", {})
                    cookies = cookie_data.get("cookies", [])
                    info = "\n".join([
                        "Cookie 状态信息:",
                        f"文件路径: {cookie_manager.cookie_file}",
                        f"Cookie 数量: {len(cookies)}",
                        f"用户名: {metadata.get('username', '未知')}",
                        f"创建时间: {metadata.get('created_at', '未知')}",
                        f"最后验证: {metadata.get('last_validated', '未知')}",
                        f"登录URL: {metadata.get('url', '未知')}",
                    ])
                    self.log(info)

            self.check_cookie_status()
        except Exception as e:
            self.log(f"查看状态出错: {e}")

    def on_upload_cookie(self):
        """Upload the saved cookie to the configured server.

        The cookie list is flattened to a ``name=value; ...`` string and
        posted as JSON together with its MD5 digest, the crawler type and
        the platform's login URL. The request runs in a background thread.
        """
        config = global_config_manager.load_config()
        # Drop trailing slashes so the joined URL never contains "//api".
        server_url = config.get('serverUrl', '').strip().rstrip('/')
        api_key = config.get('apiKey', '').strip()

        if not server_url or not api_key:
            messagebox.showwarning("配置缺失", "请先在配置中设置服务器地址和 API Key")
            return

        self.log("正在上传 Cookie 到服务器...")
        self.set_running_state(True)

        def run():
            try:
                cookie_module = import_with_crawler_context(self.crawler_type, 'core.cookie_manager')
                cookie_manager = cookie_module.CookieManager()
                if not cookie_manager.is_cookie_exists():
                    self.log("Cookie 文件不存在,请先登录")
                    return

                cookie_data = cookie_manager.load_cookies()
                if not cookie_data:
                    self.log("加载 Cookie 失败")
                    return

                cookies = cookie_data.get("cookies", [])
                if not cookies:
                    self.log("Cookie 为空")
                    return

                # Flatten the cookie list to "name=value; name2=value2".
                cookie_string = '; '.join(f"{c['name']}={c['value']}" for c in cookies)
                # MD5 digest of the cookie string, sent in the 'md5' field.
                md5_hash = hashlib.md5(cookie_string.encode('utf-8')).hexdigest()

                login_url = PLATFORMS.get(self.crawler_type, {}).get('loginUrl', '')

                upload_url = f"{server_url}/api/crawler/cookie/upload"
                payload = {
                    'code': self.crawler_type,
                    'cookieString': cookie_string,
                    'md5': md5_hash,
                    'url': login_url
                }
                headers = {
                    'X-API-Key': api_key,
                    'Content-Type': 'application/json'
                }

                self.log(f"正在上传到: {upload_url}")
                response = requests.post(upload_url, json=payload, headers=headers, timeout=30)

                if response.status_code != 200:
                    self.log(f"上传失败: HTTP {response.status_code} - {response.text}")
                    return

                result = response.json()
                if result.get('code') == 200:
                    self.log("Cookie 上传成功!")
                    if result.get('data'):
                        self.log(f"服务器返回: {result.get('data')}")
                else:
                    self.log(f"上传失败: {result.get('msg', '未知错误')}")

            except requests.exceptions.Timeout:
                self.log("上传超时,请检查网络连接")
            except requests.exceptions.ConnectionError:
                self.log("连接服务器失败,请检查服务器地址")
            except Exception as e:
                self.log(f"上传出错: {e}")
            finally:
                self.after(0, lambda: self.set_running_state(False))

        self.running_thread = threading.Thread(target=run, daemon=True)
        self.running_thread.start()

    def on_crawl(self):
        """Single crawl - overridden by subclasses."""
        pass

    def on_daemon(self):
        """Daemon mode - overridden by subclasses."""
        pass

    def on_stop(self):
        """Request the running task to stop and poll until its thread ends."""
        self.stop_flag.set()
        # Disable the button right away so repeated clicks cannot stack
        # several pollers (each of which would log "stopped" again).
        self.stop_btn.configure(state="disabled")
        self.log("正在停止...")
        # Check after a short delay whether the task has finished.
        self.after(1000, self._check_task_stopped)

    def _check_task_stopped(self):
        """Poll once per second until the worker thread is no longer alive."""
        if self.running_thread and self.running_thread.is_alive():
            # Still running: look again in a second.
            self.after(1000, self._check_task_stopped)
            return

        self.is_running = False
        self.update_buttons_state()
        self.log("任务已停止")

    def set_running_state(self, running):
        """Set the running flag and refresh the buttons.

        Entering the running state also clears the stop flag.
        """
        self.is_running = running
        if running:
            self.stop_flag.clear()
        self.update_buttons_state()
|
||
|
||
|
||
class CommerceTab(CrawlerTab):
    """Tab for the commerce crawler ("巨量百应")."""

    # Upper bound, in seconds, of a single wait slice between daemon cycles.
    _WAIT_SLICE_SECONDS = 30

    def __init__(self, parent, log_queue):
        super().__init__(parent, "commerce", log_queue)

    def setup_params(self, parent):
        """Add the page-size and crawl-interval inputs."""
        # Page size.
        ctk.CTkLabel(parent, text="每页数量:").pack(side="left", padx=(10, 5), pady=5)
        self.page_size_var = ctk.StringVar(value="20")
        self.page_size_entry = ctk.CTkEntry(parent, width=60, textvariable=self.page_size_var)
        self.page_size_entry.pack(side="left", padx=5, pady=5)

        # Interval between daemon cycles, in hours.
        ctk.CTkLabel(parent, text="抓取间隔(小时):").pack(side="left", padx=(20, 5), pady=5)
        self.interval_var = ctk.StringVar(value="1")
        self.interval_entry = ctk.CTkEntry(parent, width=60, textvariable=self.interval_var)
        self.interval_entry.pack(side="left", padx=5, pady=5)

    def on_crawl(self):
        """Run a single crawl in a background thread."""
        self.log("开始抓取达人账号数据...")

        # Read the Tk variable here, on the UI thread, instead of inside the
        # worker thread.
        try:
            page_size = int(self.page_size_var.get())
        except ValueError as e:
            self.log(f"抓取出错: {e}")
            return

        self.set_running_state(True)

        def run():
            try:
                service_module = import_with_crawler_context('commerce', 'services.daren_account_service')
                service = service_module.DarenAccountService()
                filepath = service.crawl(page_size=page_size, status=1)

                if filepath:
                    self.log(f"抓取完成! 文件: {filepath}")
                else:
                    self.log("抓取失败")
            except Exception as e:
                self.log(f"抓取出错: {e}")
            finally:
                self.after(0, lambda: self.set_running_state(False))

        self.running_thread = threading.Thread(target=run, daemon=True)
        self.running_thread.start()

    def on_daemon(self):
        """Crawl repeatedly in a background thread until stop is requested.

        Between cycles the worker waits on the stop flag itself, so a stop
        request ends the wait immediately instead of after the current
        sleep slice.
        """
        self.log("启动守护进程模式...")

        # Read the Tk variables on the UI thread, before the worker starts.
        try:
            page_size = int(self.page_size_var.get())
            interval_hours = float(self.interval_var.get())
        except ValueError as e:
            self.log(f"守护进程出错: {e}")
            return
        interval_seconds = interval_hours * 3600

        self.set_running_state(True)

        def run():
            try:
                service_module = import_with_crawler_context('commerce', 'services.daren_account_service')
                DarenAccountService = service_module.DarenAccountService

                self.log(f"守护进程已启动 - 间隔: {interval_hours} 小时")

                cycle = 0
                while not self.stop_flag.is_set():
                    cycle += 1
                    self.log(f"第 {cycle} 次抓取开始...")

                    try:
                        service = DarenAccountService()
                        filepath = service.crawl(page_size=page_size, status=1)

                        if filepath:
                            self.log(f"第 {cycle} 次抓取完成: {filepath.name}")
                        else:
                            self.log(f"第 {cycle} 次抓取失败")
                    except Exception as e:
                        self.log(f"第 {cycle} 次抓取出错: {e}")

                    self.log(f"等待 {interval_hours} 小时后进行下次抓取...")

                    # Wait in bounded slices; Event.wait returns True as soon
                    # as the stop flag is set, which ends the wait at once.
                    remaining = interval_seconds
                    while remaining > 0:
                        if self.stop_flag.wait(timeout=min(self._WAIT_SLICE_SECONDS, remaining)):
                            break
                        remaining -= self._WAIT_SLICE_SECONDS

                self.log("守护进程已停止")
            except Exception as e:
                self.log(f"守护进程出错: {e}")
            finally:
                self.after(0, lambda: self.set_running_state(False))

        self.running_thread = threading.Thread(target=run, daemon=True)
        self.running_thread.start()
|
||
|
||
|
||
class EntertainmentTab(CrawlerTab):
    """Tab for the entertainment crawler (leaderboard data)."""

    # Upper bound, in seconds, of a single wait slice between daemon cycles.
    _WAIT_SLICE_SECONDS = 30

    def __init__(self, parent, log_queue):
        super().__init__(parent, "entertainment", log_queue)

    def setup_params(self, parent):
        """Add the page-size, fetch-fans and crawl-interval inputs."""
        # Page size.
        ctk.CTkLabel(parent, text="每页数量:").pack(side="left", padx=(10, 5), pady=5)
        self.size_var = ctk.StringVar(value="10")
        self.size_entry = ctk.CTkEntry(parent, width=60, textvariable=self.size_var)
        self.size_entry.pack(side="left", padx=5, pady=5)

        # Whether fan data is fetched as well.
        self.fetch_fans_var = ctk.BooleanVar(value=True)
        self.fetch_fans_cb = ctk.CTkCheckBox(
            parent, text="抓取粉丝数据",
            variable=self.fetch_fans_var
        )
        self.fetch_fans_cb.pack(side="left", padx=20, pady=5)

        # Interval between daemon cycles, in hours.
        ctk.CTkLabel(parent, text="抓取间隔(小时):").pack(side="left", padx=(20, 5), pady=5)
        self.interval_var = ctk.StringVar(value="1")
        self.interval_entry = ctk.CTkEntry(parent, width=60, textvariable=self.interval_var)
        self.interval_entry.pack(side="left", padx=5, pady=5)

    def on_crawl(self):
        """Run a single leaderboard crawl in a background thread."""
        self.log("开始抓取排行榜数据...")

        # Read the Tk variables here, on the UI thread, instead of inside the
        # worker thread.
        try:
            size = int(self.size_var.get())
            fetch_fans = self.fetch_fans_var.get()
        except ValueError as e:
            self.log(f"抓取出错: {e}")
            return

        self.set_running_state(True)

        def run():
            try:
                service_module = import_with_crawler_context('entertainment', 'services.crawler_service')
                service = service_module.CrawlerService()
                filepath = service.crawl(
                    rank_type=0,
                    size=size,
                    filter_type='anchor',
                    fetch_fans=fetch_fans
                )

                if filepath:
                    self.log(f"抓取完成! 文件: {filepath}")
                else:
                    self.log("抓取失败")
            except Exception as e:
                self.log(f"抓取出错: {e}")
            finally:
                self.after(0, lambda: self.set_running_state(False))

        self.running_thread = threading.Thread(target=run, daemon=True)
        self.running_thread.start()

    def on_daemon(self):
        """Crawl repeatedly in a background thread until stop is requested.

        Between cycles the worker waits on the stop flag itself, so a stop
        request ends the wait immediately instead of after the current
        sleep slice.
        """
        self.log("启动守护进程模式...")

        # Read the Tk variables on the UI thread, before the worker starts.
        try:
            size = int(self.size_var.get())
            fetch_fans = self.fetch_fans_var.get()
            interval_hours = float(self.interval_var.get())
        except ValueError as e:
            self.log(f"守护进程出错: {e}")
            return
        interval_seconds = interval_hours * 3600

        self.set_running_state(True)

        def run():
            try:
                service_module = import_with_crawler_context('entertainment', 'services.crawler_service')
                CrawlerService = service_module.CrawlerService

                self.log(f"守护进程已启动 - 间隔: {interval_hours} 小时")

                cycle = 0
                while not self.stop_flag.is_set():
                    cycle += 1
                    self.log(f"第 {cycle} 次抓取开始...")

                    try:
                        service = CrawlerService()
                        filepath = service.crawl(
                            rank_type=0,
                            size=size,
                            filter_type='anchor',
                            fetch_fans=fetch_fans
                        )

                        if filepath:
                            self.log(f"第 {cycle} 次抓取完成: {filepath.name}")
                        else:
                            self.log(f"第 {cycle} 次抓取失败")
                    except Exception as e:
                        self.log(f"第 {cycle} 次抓取出错: {e}")

                    self.log(f"等待 {interval_hours} 小时后进行下次抓取...")

                    # Wait in bounded slices; Event.wait returns True as soon
                    # as the stop flag is set, which ends the wait at once.
                    remaining = interval_seconds
                    while remaining > 0:
                        if self.stop_flag.wait(timeout=min(self._WAIT_SLICE_SECONDS, remaining)):
                            break
                        remaining -= self._WAIT_SLICE_SECONDS

                self.log("守护进程已停止")
            except Exception as e:
                self.log(f"守护进程出错: {e}")
            finally:
                self.after(0, lambda: self.set_running_state(False))

        self.running_thread = threading.Thread(target=run, daemon=True)
        self.running_thread.start()
|
||
|
||
|
||
class App(ctk.CTk):
    """Main application window.

    Hosts the two crawler tabs and a shared log view that is fed from a
    queue, which also receives everything written to stdout/stderr.
    """

    # Maximum number of log lines kept; older lines are dropped beyond this.
    MAX_LOG_LINES = 1000

    def __init__(self):
        super().__init__()

        self.title("字节直播数据采集工具")
        self.geometry("900x650")
        self.minsize(800, 550)

        # Queue through which all log lines reach the UI.
        self.log_queue = queue.Queue()

        # Route stdout/stderr into the log queue.
        self.setup_stdout_redirect()

        self.setup_ui()
        self.process_log_queue()

    def setup_stdout_redirect(self):
        """Replace ``sys.stdout``/``sys.stderr`` with queue-backed writers.

        The original streams are kept on ``self.original_stdout`` and
        ``self.original_stderr`` so ``on_exit`` can restore them.
        """
        class StdoutRedirector:
            """Writer that mirrors text to a stream and queues whole lines."""

            def __init__(self, log_queue, original_stdout):
                self.log_queue = log_queue
                self.original_stdout = original_stdout
                self.buffer = ""

            def write(self, message):
                # Mirror to the original stream (console); best-effort.
                if self.original_stdout:
                    try:
                        self.original_stdout.write(message)
                    except Exception:
                        pass

                # Queue every completed, non-blank line; keep the unfinished
                # tail in the buffer for the next write.
                if message:
                    self.buffer += message
                    *complete_lines, self.buffer = self.buffer.split('\n')
                    for line in complete_lines:
                        if line.strip():
                            self.log_queue.put(f"{line}\n")

            def flush(self):
                if self.original_stdout:
                    try:
                        self.original_stdout.flush()
                    except Exception:
                        pass
                # Emit whatever is left in the buffer as a final line.
                if self.buffer.strip():
                    self.log_queue.put(f"{self.buffer}\n")
                    self.buffer = ""

        # Remember the original streams.
        self.original_stdout = sys.stdout
        self.original_stderr = sys.stderr

        # Install the redirectors.
        sys.stdout = StdoutRedirector(self.log_queue, self.original_stdout)
        sys.stderr = StdoutRedirector(self.log_queue, self.original_stderr)

    def setup_ui(self):
        """Build the title, the tab view and the log area."""
        # Title.
        title_label = ctk.CTkLabel(
            self,
            text="直播数据采集工具",
            font=ctk.CTkFont(size=20, weight="bold")
        )
        title_label.pack(pady=10)

        # Tab view with one tab per crawler.
        self.tabview = ctk.CTkTabview(self)
        self.tabview.pack(fill="both", expand=True, padx=10, pady=5)

        self.tabview.add("巨量百应")
        self.tabview.add("字节联盟-主播排行")

        self.commerce_tab = CommerceTab(
            self.tabview.tab("巨量百应"),
            self.log_queue
        )
        self.commerce_tab.pack(fill="both", expand=True)

        self.entertainment_tab = EntertainmentTab(
            self.tabview.tab("字节联盟-主播排行"),
            self.log_queue
        )
        self.entertainment_tab.pack(fill="both", expand=True)

        # Tab selected at start-up.
        self.tabview.set("字节联盟-主播排行")

        # Log area.
        log_frame = ctk.CTkFrame(self)
        log_frame.pack(fill="both", expand=True, padx=10, pady=10)

        # Log header row.
        log_header = ctk.CTkFrame(log_frame, fg_color="transparent")
        log_header.pack(fill="x", padx=5, pady=(5, 0))

        log_label = ctk.CTkLabel(log_header, text="运行日志:", anchor="w")
        log_label.pack(side="left")

        # Line counter.
        self.log_count_label = ctk.CTkLabel(log_header, text="(0 行)", anchor="w", text_color="gray")
        self.log_count_label.pack(side="left", padx=5)

        self.log_text = ctk.CTkTextbox(log_frame, height=150)
        self.log_text.pack(fill="both", expand=True, padx=5, pady=5)

        # Button row below the log.
        btn_frame = ctk.CTkFrame(log_frame, fg_color="transparent")
        btn_frame.pack(fill="x", padx=5, pady=5)

        clear_btn = ctk.CTkButton(
            btn_frame, text="清空日志", width=80,
            command=self.clear_log
        )
        clear_btn.pack(side="left", padx=5)

        exit_btn = ctk.CTkButton(
            btn_frame, text="退出程序", width=80,
            fg_color="red", hover_color="darkred",
            command=self.on_exit
        )
        exit_btn.pack(side="right", padx=5)

        # Initial log entry.
        self.log("程序启动完成,请选择爬虫类型并操作")

    def log(self, message):
        """Queue ``message`` prefixed with a timestamp."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        self.log_queue.put(f"[{timestamp}] {message}\n")

    def clear_log(self):
        """Remove all text from the log view."""
        self.log_text.delete("1.0", "end")
        self.update_log_count()

    @staticmethod
    def _count_lines(content):
        """Return the number of lines in ``content``.

        A trailing newline terminates the last line rather than starting a
        new, empty one, so "a\\nb\\n" counts as two lines.
        """
        if not content:
            return 0
        return content.count('\n') + (0 if content.endswith('\n') else 1)

    def _truncation_notice(self):
        """Return the notice line shown at the top of a truncated log."""
        return f"[日志已截断,保留最新 {self.MAX_LOG_LINES} 行]\n"

    def update_log_count(self):
        """Refresh the line counter label from the log view's content."""
        try:
            content = self.log_text.get("1.0", "end-1c")
            line_count = self._count_lines(content)
            self.log_count_label.configure(text=f"({line_count} 行)")
        except Exception:
            # Best-effort UI refresh; a failure here must not break logging.
            pass

    def truncate_log_if_needed(self):
        """Drop the oldest lines once the log exceeds ``MAX_LOG_LINES``.

        Only the surplus lines are deleted from the top of the view; the
        truncation notice is inserted once and is not counted against the
        limit, so the log is not rewritten on every subsequent message.
        """
        try:
            content = self.log_text.get("1.0", "end-1c")
            notice = self._truncation_notice()
            has_notice = content.startswith(notice)

            body_lines = self._count_lines(content) - (1 if has_notice else 0)
            excess = body_lines - self.MAX_LOG_LINES
            if excess <= 0:
                return

            # Oldest log line sits on line 2 when the notice occupies line 1.
            first = 2 if has_notice else 1
            self.log_text.delete(f"{first}.0", f"{first + excess}.0")
            if not has_notice:
                self.log_text.insert("1.0", notice)
        except Exception:
            # Best-effort housekeeping; never let it break the log pump.
            pass

    def process_log_queue(self):
        """Move queued messages into the log view; reschedules itself."""
        message_count = 0
        try:
            while True:
                message = self.log_queue.get_nowait()
                self.log_text.insert("end", message)
                message_count += 1

                # Keep the widget bounded while draining a large backlog.
                if message_count % 50 == 0:
                    self.truncate_log_if_needed()
        except queue.Empty:
            pass

        if message_count > 0:
            # Truncate first so the counter and the scroll position reflect
            # the final content.
            self.truncate_log_if_needed()
            self.update_log_count()
            self.log_text.see("end")

        # Poll again in 100 ms.
        self.after(100, self.process_log_queue)

    def on_exit(self):
        """Ask for confirmation, restore the std streams and close the app."""
        if messagebox.askyesno("确认", "确定要退出程序吗?"):
            sys.stdout = self.original_stdout
            sys.stderr = self.original_stderr
            self.destroy()
|
||
|
||
|
||
def main():
    """Create the application window and run its event loop."""
    App().mainloop()
|
||
|
||
|
||
# Start the GUI only when this file is executed as a script.
if __name__ == "__main__":
    main()
|