""" 字节直播数据采集工具 - GUI 版本 整合 commerce 和 entertainment 两个爬虫 """ import sys import os import threading import queue import json import hashlib import requests from datetime import datetime from pathlib import Path # 处理打包后的路径 if getattr(sys, 'frozen', False): # 打包后运行 BASE_DIR = Path(sys._MEIPASS) GUI_DIR = BASE_DIR CRAWLER_DIR = BASE_DIR # 设置 Playwright 浏览器路径 playwright_path = BASE_DIR / 'ms-playwright' if playwright_path.exists(): os.environ['PLAYWRIGHT_BROWSERS_PATH'] = str(playwright_path) else: # 开发模式运行 GUI_DIR = Path(__file__).parent CRAWLER_DIR = GUI_DIR.parent # 添加项目路径 sys.path.insert(0, str(CRAWLER_DIR)) import customtkinter as ctk from tkinter import messagebox import importlib def import_with_crawler_context(crawler_type: str, module_path: str): """ 在指定爬虫的上下文中导入模块 解决 commerce 和 entertainment 都有同名 config/core/utils 模块的冲突问题 Args: crawler_type: 'commerce' 或 'entertainment' module_path: 要导入的模块路径,如 'core.browser_login' Returns: 导入的模块 """ crawler_path = str(CRAWLER_DIR / crawler_type) # 清除可能冲突的缓存模块 conflicting_prefixes = ('config', 'core', 'utils', 'services') modules_to_remove = [ name for name in sys.modules.keys() if any(name == prefix or name.startswith(prefix + '.') for prefix in conflicting_prefixes) ] for mod_name in modules_to_remove: del sys.modules[mod_name] # 确保爬虫路径在 sys.path 最前面 if crawler_path in sys.path: sys.path.remove(crawler_path) sys.path.insert(0, crawler_path) # 导入模块(不再在 finally 中移除路径,因为模块运行时可能还需要) return importlib.import_module(module_path) # 设置外观 ctk.set_appearance_mode("dark") ctk.set_default_color_theme("blue") # 平台配置(与浏览器插件保持一致) PLATFORMS = { 'commerce': { 'name': '巨量百应-达人账号', 'loginUrl': 'https://buyin.jinritemai.com/mpa/account/institution-role-select' }, 'entertainment': { 'name': '字节联盟-主播排行', 'loginUrl': 'https://union.bytedance.com/open/portal/data/leaderboard?appId=3000' } } class GlobalConfigManager: """全局配置管理器,用于管理上传服务器配置""" DEFAULT_CONFIG = { 'serverUrl': 'https://admin.api.skzhijia.com', 'apiKey': 
'sk_7A353DEF2BFD4EADA4BF364CCC7C8FDE2AAD1' } def __init__(self): # 配置文件保存在 gui 目录下 if getattr(sys, 'frozen', False): self.config_file = Path(sys.executable).parent / 'upload_config.json' else: self.config_file = GUI_DIR / 'upload_config.json' def load_config(self) -> dict: """加载配置""" if self.config_file.exists(): try: with open(self.config_file, 'r', encoding='utf-8') as f: config = json.load(f) # 合并默认配置 return {**self.DEFAULT_CONFIG, **config} except Exception: pass return self.DEFAULT_CONFIG.copy() def save_config(self, config: dict) -> bool: """保存配置""" try: with open(self.config_file, 'w', encoding='utf-8') as f: json.dump(config, f, ensure_ascii=False, indent=2) return True except Exception: return False # 全局配置管理器实例 global_config_manager = GlobalConfigManager() class ConfigDialog(ctk.CTkToplevel): """配置对话框""" def __init__(self, parent, crawler_type, on_save_callback=None): super().__init__(parent) self.crawler_type = crawler_type self.on_save_callback = on_save_callback self.title(f"配置 - {crawler_type}") self.geometry("500x480") self.resizable(False, False) # 模态对话框 self.transient(parent) self.grab_set() self.load_config() self.setup_ui() # 居中显示 self.update_idletasks() x = (self.winfo_screenwidth() - self.winfo_width()) // 2 y = (self.winfo_screenheight() - self.winfo_height()) // 2 self.geometry(f"+{x}+{y}") def load_config(self): """加载配置""" try: config_module = import_with_crawler_context(self.crawler_type, 'config.settings') self.config_values = { 'REPORT_URL': getattr(config_module, 'REPORT_URL', ''), 'REQUEST_TIMEOUT': getattr(config_module, 'REQUEST_TIMEOUT', 30), 'BROWSER_TIMEOUT': getattr(config_module, 'BROWSER_TIMEOUT', 300), 'FANS_FETCH_CONCURRENCY': getattr(config_module, 'FANS_FETCH_CONCURRENCY', 5), } if self.crawler_type == 'commerce': self.config_values['DAEMON_INTERVAL_HOURS'] = getattr(config_module, 'DAEMON_INTERVAL_HOURS', 1) else: self.config_values['DAEMON_INTERVAL_NO_FANS_HOURS'] = getattr(config_module, 'DAEMON_INTERVAL_NO_FANS_HOURS', 1) 
                self.config_values['DAEMON_INTERVAL_WITH_FANS_HOURS'] = getattr(config_module, 'DAEMON_INTERVAL_WITH_FANS_HOURS', 6)
        except Exception as e:
            self.config_values = {}
            messagebox.showerror("错误", f"加载配置失败: {e}")
        # Load the global upload configuration.
        self.global_config = global_config_manager.load_config()

    def setup_ui(self):
        """Build the dialog widgets."""
        # Scrollable container.
        scroll_frame = ctk.CTkScrollableFrame(self)
        scroll_frame.pack(fill="both", expand=True, padx=10, pady=10)
        self.entries = {}
        self.global_entries = {}
        # Cookie upload configuration (global settings).
        ctk.CTkLabel(scroll_frame, text="Cookie 上传配置", font=ctk.CTkFont(weight="bold")).pack(anchor="w", pady=(0, 5))
        # Server address.
        self._add_global_entry(scroll_frame, "服务器地址:", 'serverUrl', width=400)
        # API key (masked input).
        self._add_global_entry(scroll_frame, "API Key:", 'apiKey', width=400, show="*")
        # Common crawler configuration.
        ctk.CTkLabel(scroll_frame, text="爬虫配置", font=ctk.CTkFont(weight="bold")).pack(anchor="w", pady=(15, 5))
        # Data report URL.
        self._add_entry(scroll_frame, "数据上报URL:", 'REPORT_URL', width=400)
        # Request timeout.
        self._add_entry(scroll_frame, "请求超时(秒):", 'REQUEST_TIMEOUT', width=100)
        # Browser login timeout.
        self._add_entry(scroll_frame, "登录超时(秒):", 'BROWSER_TIMEOUT', width=100)
        # Fan-fetching concurrency.
        self._add_entry(scroll_frame, "粉丝抓取并发数:", 'FANS_FETCH_CONCURRENCY', width=100)
        # Daemon interval settings.
        ctk.CTkLabel(scroll_frame, text="守护进程配置", font=ctk.CTkFont(weight="bold")).pack(anchor="w", pady=(15, 5))
        if self.crawler_type == 'commerce':
            self._add_entry(scroll_frame, "抓取间隔(小时):", 'DAEMON_INTERVAL_HOURS', width=100)
        else:
            self._add_entry(scroll_frame, "无粉丝抓取间隔(小时):", 'DAEMON_INTERVAL_NO_FANS_HOURS', width=100)
            self._add_entry(scroll_frame, "含粉丝抓取间隔(小时):", 'DAEMON_INTERVAL_WITH_FANS_HOURS', width=100)
        # Button row.
        btn_frame = ctk.CTkFrame(self)
        btn_frame.pack(fill="x", padx=10, pady=10)
        ctk.CTkButton(btn_frame, text="保存", width=80, command=self.on_save).pack(side="right", padx=5)
        ctk.CTkButton(btn_frame, text="取消", width=80, command=self.destroy).pack(side="right", padx=5)

    def _add_entry(self, parent, label, key, width=200):
        """Add one crawler-config row (label + entry) to *parent*."""
        frame = ctk.CTkFrame(parent, fg_color="transparent")
        frame.pack(fill="x", pady=3)
        ctk.CTkLabel(frame, text=label, width=150, anchor="w").pack(side="left")
        entry = ctk.CTkEntry(frame, width=width)
        entry.pack(side="left", padx=5)
        value = self.config_values.get(key, '')
        entry.insert(0, str(value) if value else '')
        self.entries[key] = entry

    def _add_global_entry(self, parent, label, key, width=200, show=None):
        """Add one global-config row; *show* masks input (e.g. '*' for secrets)."""
        frame = ctk.CTkFrame(parent, fg_color="transparent")
        frame.pack(fill="x", pady=3)
        ctk.CTkLabel(frame, text=label, width=150, anchor="w").pack(side="left")
        entry = ctk.CTkEntry(frame, width=width, show=show)
        entry.pack(side="left", padx=5)
        value = self.global_config.get(key, '')
        entry.insert(0, str(value) if value else '')
        self.global_entries[key] = entry

    def on_save(self):
        """Persist settings, fire the save callback, and close the dialog."""
        # Save the global upload configuration.
        # NOTE(review): the per-crawler entries (self.entries) are collected
        # but never written back anywhere — only the global config is saved.
        # Confirm whether crawler settings are meant to be persisted here.
        new_global_config = {}
        for key, entry in self.global_entries.items():
            new_global_config[key] = entry.get().strip()
        if global_config_manager.save_config(new_global_config):
            messagebox.showinfo("提示", "配置已保存。\n注意:部分爬虫配置需要重启程序才能生效。")
        else:
            messagebox.showerror("错误", "保存配置失败")
            return
        if self.on_save_callback:
            self.on_save_callback()
        self.destroy()


class LogHandler:
    """File-like object that forwards writes into the GUI log queue."""

    def __init__(self, text_widget, log_queue):
        self.text_widget = text_widget
        self.log_queue = log_queue

    def write(self, message):
        # Skip whitespace-only writes (e.g. bare newlines from print()).
        if message.strip():
            self.log_queue.put(message)

    def flush(self):
        pass


class CrawlerTab(ctk.CTkFrame):
    """Base class for one crawler's tab."""

    def __init__(self, parent, crawler_type, log_queue):
        super().__init__(parent)
        self.crawler_type = crawler_type
        self.log_queue = log_queue
        self.running_thread = None
        self.stop_flag = threading.Event()
        self.is_logged_in = False
        self.is_running = False  # whether a task is currently running
        self.setup_ui()
        self.check_cookie_status()

    def setup_ui(self):
        # Cookie status area.
        status_frame = ctk.CTkFrame(self)
        status_frame.pack(fill="x", padx=10, pady=10)
        self.cookie_label = ctk.CTkLabel(
            status_frame,
            text="Cookie 状态: 检查中...",
            font=ctk.CTkFont(size=14)
        )
        self.cookie_label.pack(side="left", padx=10, pady=5)
        # Button row.
        btn_frame = ctk.CTkFrame(self)
        btn_frame.pack(fill="x", padx=10, pady=5)
        self.login_btn = ctk.CTkButton(
            btn_frame, text="登录", width=80,
            command=self.on_login
        )
        self.login_btn.pack(side="left", padx=5, pady=5)
        self.logout_btn = ctk.CTkButton(
            btn_frame, text="退出登录", width=80,
            fg_color="orange", hover_color="darkorange",
            command=self.on_logout, state="disabled"
        )
        self.logout_btn.pack(side="left", padx=5, pady=5)
        self.upload_btn = ctk.CTkButton(
            btn_frame, text="上传Cookie", width=90,
            fg_color="#28a745", hover_color="#218838",
            command=self.on_upload_cookie, state="disabled"
        )
        self.upload_btn.pack(side="left", padx=5, pady=5)
        self.status_btn = ctk.CTkButton(
            btn_frame, text="查看状态", width=80,
            command=self.on_status, state="disabled"
        )
        self.status_btn.pack(side="left", padx=5, pady=5)
        self.crawl_btn = ctk.CTkButton(
            btn_frame, text="单次抓取", width=80,
            command=self.on_crawl, state="disabled"
        )
        self.crawl_btn.pack(side="left", padx=5, pady=5)
        self.daemon_btn = ctk.CTkButton(
            btn_frame, text="启动守护", width=80,
            command=self.on_daemon, state="disabled"
        )
        self.daemon_btn.pack(side="left", padx=5, pady=5)
        self.stop_btn = ctk.CTkButton(
            btn_frame, text="停止", width=80,
            fg_color="red", hover_color="darkred",
            command=self.on_stop, state="disabled"
        )
        self.stop_btn.pack(side="left", padx=5, pady=5)
        self.config_btn = ctk.CTkButton(
            btn_frame, text="配置", width=80,
            fg_color="gray", hover_color="darkgray",
            command=self.on_config
        )
        self.config_btn.pack(side="left", padx=5, pady=5)
        # Parameter area (populated by subclasses).
        params_frame = ctk.CTkFrame(self)
        params_frame.pack(fill="x", padx=10, pady=5)
        self.setup_params(params_frame)

    def setup_params(self, parent):
        """Subclasses override this to add their parameter widgets."""
        pass

    def log(self, message):
        """Queue a timestamped, crawler-tagged log line."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        self.log_queue.put(f"[{timestamp}] [{self.crawler_type}] {message}\n")

    def check_cookie_status(self):
        """Refresh the cookie-status label and login state from disk."""
        try:
            cookie_module = import_with_crawler_context(self.crawler_type, 'core.cookie_manager')
            CookieManager = cookie_module.CookieManager
            cookie_manager = CookieManager()
            if cookie_manager.is_cookie_exists():
                cookie_data = cookie_manager.load_cookies()
                if cookie_data:
                    metadata = cookie_data.get("metadata", {})
                    username = metadata.get("username", "未知")
                    self.cookie_label.configure(
                        text=f"Cookie 状态: 已登录 ({username})",
                        text_color="green"
                    )
                    self.is_logged_in = True
                    self.update_buttons_state()
                    return
            # Fall through here when the file is missing or failed to load.
            self.cookie_label.configure(
                text="Cookie 状态: 未登录",
                text_color="red"
            )
            self.is_logged_in = False
            self.update_buttons_state()
        except Exception as e:
            self.cookie_label.configure(
                text=f"Cookie 状态: 检查失败 ({e})",
                text_color="orange"
            )
            self.is_logged_in = False
            self.update_buttons_state()

    def update_buttons_state(self):
        """Enable/disable buttons based on login state and running state."""
        if self.is_running:
            # Task in progress: only the stop button is usable.
            self.login_btn.configure(state="disabled")
            self.logout_btn.configure(state="disabled")
            self.upload_btn.configure(state="disabled")
            self.status_btn.configure(state="disabled")
            self.crawl_btn.configure(state="disabled")
            self.daemon_btn.configure(state="disabled")
            self.stop_btn.configure(state="normal")
            self.config_btn.configure(state="disabled")
        elif self.is_logged_in:
            # Logged in and idle: everything except login is enabled.
            self.login_btn.configure(state="disabled")
            self.logout_btn.configure(state="normal")
            self.upload_btn.configure(state="normal")
            self.status_btn.configure(state="normal")
            self.crawl_btn.configure(state="normal")
            self.daemon_btn.configure(state="normal")
            self.stop_btn.configure(state="disabled")
            self.config_btn.configure(state="normal")
        else:
            # Not logged in: only login (and config) are enabled.
            self.login_btn.configure(state="normal")
            self.logout_btn.configure(state="disabled")
            self.upload_btn.configure(state="disabled")
            self.status_btn.configure(state="disabled")
            self.crawl_btn.configure(state="disabled")
            self.daemon_btn.configure(state="disabled")
            self.stop_btn.configure(state="disabled")
            self.config_btn.configure(state="normal")

    def on_login(self):
        """Open the browser login flow in a worker thread."""
        self.log("正在启动浏览器登录...")
        self.is_running = True
        self.update_buttons_state()

        def run():
            try:
                browser_module = import_with_crawler_context(self.crawler_type, 'core.browser_login')
                BrowserLogin = browser_module.BrowserLogin
                browser_login = BrowserLogin()
                cookies = browser_login.trigger_login()
                if cookies:
                    self.log("登录成功!")
                else:
                    self.log("登录失败或被取消")
            except Exception as e:
                self.log(f"登录出错: {e}")
            finally:
                # Hand UI updates back to the main (Tk) thread.
                self.after(0, self._on_login_complete)

        self.running_thread = threading.Thread(target=run, daemon=True)
        self.running_thread.start()

    def _on_login_complete(self):
        """Reset the running flag and re-check the cookie after login."""
        self.is_running = False
        self.check_cookie_status()

    def on_logout(self):
        """Log out after confirmation by deleting the stored cookie file."""
        if messagebox.askyesno("确认", "确定要退出登录吗?\n这将清除保存的Cookie。"):
            try:
                cookie_module = import_with_crawler_context(self.crawler_type, 'core.cookie_manager')
                CookieManager = cookie_module.CookieManager
                cookie_manager = CookieManager()
                cookie_file = cookie_manager.cookie_file
                if cookie_file.exists():
                    cookie_file.unlink()
                    self.log("已退出登录,Cookie 已清除")
                else:
                    self.log("Cookie 文件不存在")
                self.check_cookie_status()
            except Exception as e:
                self.log(f"退出登录失败: {e}")

    def on_config(self):
        """Open the configuration dialog."""
        ConfigDialog(self.winfo_toplevel(), self.crawler_type)

    def on_status(self):
        """Log detailed information about the stored cookie."""
        try:
            cookie_module = import_with_crawler_context(self.crawler_type, 'core.cookie_manager')
            CookieManager = cookie_module.CookieManager
            cookie_manager = CookieManager()
            if cookie_manager.is_cookie_exists():
                cookie_data = cookie_manager.load_cookies()
                if cookie_data:
                    metadata = cookie_data.get("metadata", {})
                    cookies = cookie_data.get("cookies", [])
                    info = f"""Cookie 状态信息:
文件路径: {cookie_manager.cookie_file}
Cookie 数量: {len(cookies)}
用户名: {metadata.get('username', '未知')}
创建时间: {metadata.get('created_at', '未知')}
最后验证: {metadata.get('last_validated', '未知')}
登录URL: {metadata.get('url', '未知')}"""
                    self.log(info)
                else:
                    self.log("Cookie 文件存在但加载失败")
            else:
                self.log("Cookie 文件不存在,请先登录")
            self.check_cookie_status()
        except Exception as e:
            self.log(f"查看状态出错: {e}")

    def on_upload_cookie(self):
        """Upload the stored cookie to the configured server."""
        # Check the upload configuration first.
        config = global_config_manager.load_config()
server_url = config.get('serverUrl', '').strip() api_key = config.get('apiKey', '').strip() if not server_url or not api_key: messagebox.showwarning("配置缺失", "请先在配置中设置服务器地址和 API Key") return self.log("正在上传 Cookie 到服务器...") self.set_running_state(True) def run(): try: # 获取 cookie 数据 cookie_module = import_with_crawler_context(self.crawler_type, 'core.cookie_manager') CookieManager = cookie_module.CookieManager cookie_manager = CookieManager() if not cookie_manager.is_cookie_exists(): self.log("Cookie 文件不存在,请先登录") return cookie_data = cookie_manager.load_cookies() if not cookie_data: self.log("加载 Cookie 失败") return cookies = cookie_data.get("cookies", []) if not cookies: self.log("Cookie 为空") return # 将 cookie 列表转换为字符串格式:name=value; name2=value2 cookie_string = '; '.join([f"{c['name']}={c['value']}" for c in cookies]) # 计算 MD5 md5_hash = hashlib.md5(cookie_string.encode('utf-8')).hexdigest() # 获取平台配置 platform = PLATFORMS.get(self.crawler_type, {}) login_url = platform.get('loginUrl', '') # 上传到服务器 upload_url = f"{server_url}/api/crawler/cookie/upload" payload = { 'code': self.crawler_type, 'cookieString': cookie_string, 'md5': md5_hash, 'url': login_url } headers = { 'X-API-Key': api_key, 'Content-Type': 'application/json' } self.log(f"正在上传到: {upload_url}") response = requests.post(upload_url, json=payload, headers=headers, timeout=30) if response.status_code != 200: self.log(f"上传失败: HTTP {response.status_code} - {response.text}") return result = response.json() if result.get('code') == 200: self.log(f"Cookie 上传成功!") if result.get('data'): self.log(f"服务器返回: {result.get('data')}") else: self.log(f"上传失败: {result.get('msg', '未知错误')}") except requests.exceptions.Timeout: self.log("上传超时,请检查网络连接") except requests.exceptions.ConnectionError: self.log("连接服务器失败,请检查服务器地址") except Exception as e: self.log(f"上传出错: {e}") finally: self.after(0, lambda: self.set_running_state(False)) self.running_thread = threading.Thread(target=run, daemon=True) self.running_thread.start() def 
on_crawl(self): """单次抓取 - 子类重写""" pass def on_daemon(self): """守护进程 - 子类重写""" pass def on_stop(self): """停止""" self.stop_flag.set() self.log("正在停止...") # 给一个短暂的延迟后检查任务是否已停止 self.after(1000, self._check_task_stopped) def _check_task_stopped(self): """检查任务是否已停止""" if self.running_thread and self.running_thread.is_alive(): # 任务仍在运行,继续等待 self.after(1000, self._check_task_stopped) else: # 任务已停止 self.is_running = False self.update_buttons_state() self.log("任务已停止") def set_running_state(self, running): """设置运行状态""" self.is_running = running if running: self.stop_flag.clear() self.update_buttons_state() class CommerceTab(CrawlerTab): """巨量百应 Tab""" def __init__(self, parent, log_queue): super().__init__(parent, "commerce", log_queue) def setup_params(self, parent): # 每页数量 ctk.CTkLabel(parent, text="每页数量:").pack(side="left", padx=(10, 5), pady=5) self.page_size_var = ctk.StringVar(value="20") self.page_size_entry = ctk.CTkEntry(parent, width=60, textvariable=self.page_size_var) self.page_size_entry.pack(side="left", padx=5, pady=5) # 间隔时间 ctk.CTkLabel(parent, text="抓取间隔(小时):").pack(side="left", padx=(20, 5), pady=5) self.interval_var = ctk.StringVar(value="1") self.interval_entry = ctk.CTkEntry(parent, width=60, textvariable=self.interval_var) self.interval_entry.pack(side="left", padx=5, pady=5) def on_crawl(self): """单次抓取""" self.log("开始抓取达人账号数据...") self.set_running_state(True) def run(): try: service_module = import_with_crawler_context('commerce', 'services.daren_account_service') DarenAccountService = service_module.DarenAccountService page_size = int(self.page_size_var.get()) service = DarenAccountService() filepath = service.crawl(page_size=page_size, status=1) if filepath: self.log(f"抓取完成! 
文件: {filepath}") else: self.log("抓取失败") except Exception as e: self.log(f"抓取出错: {e}") finally: self.after(0, lambda: self.set_running_state(False)) self.running_thread = threading.Thread(target=run, daemon=True) self.running_thread.start() def on_daemon(self): """守护进程模式""" self.log("启动守护进程模式...") self.set_running_state(True) def run(): try: import time service_module = import_with_crawler_context('commerce', 'services.daren_account_service') DarenAccountService = service_module.DarenAccountService page_size = int(self.page_size_var.get()) interval_hours = float(self.interval_var.get()) interval_seconds = interval_hours * 3600 self.log(f"守护进程已启动 - 间隔: {interval_hours} 小时") cycle = 0 while not self.stop_flag.is_set(): cycle += 1 self.log(f"第 {cycle} 次抓取开始...") try: service = DarenAccountService() filepath = service.crawl(page_size=page_size, status=1) if filepath: self.log(f"第 {cycle} 次抓取完成: {filepath.name}") else: self.log(f"第 {cycle} 次抓取失败") except Exception as e: self.log(f"第 {cycle} 次抓取出错: {e}") # 等待下次抓取 self.log(f"等待 {interval_hours} 小时后进行下次抓取...") # 分段等待,以便响应停止信号 waited = 0 while waited < interval_seconds and not self.stop_flag.is_set(): time.sleep(min(30, interval_seconds - waited)) waited += 30 self.log("守护进程已停止") except Exception as e: self.log(f"守护进程出错: {e}") finally: self.after(0, lambda: self.set_running_state(False)) self.running_thread = threading.Thread(target=run, daemon=True) self.running_thread.start() class EntertainmentTab(CrawlerTab): """娱乐版 Tab""" def __init__(self, parent, log_queue): super().__init__(parent, "entertainment", log_queue) def setup_params(self, parent): # 每页数量 ctk.CTkLabel(parent, text="每页数量:").pack(side="left", padx=(10, 5), pady=5) self.size_var = ctk.StringVar(value="10") self.size_entry = ctk.CTkEntry(parent, width=60, textvariable=self.size_var) self.size_entry.pack(side="left", padx=5, pady=5) # 抓取粉丝 self.fetch_fans_var = ctk.BooleanVar(value=True) self.fetch_fans_cb = ctk.CTkCheckBox( parent, text="抓取粉丝数据", 
variable=self.fetch_fans_var ) self.fetch_fans_cb.pack(side="left", padx=20, pady=5) # 间隔时间 ctk.CTkLabel(parent, text="抓取间隔(小时):").pack(side="left", padx=(20, 5), pady=5) self.interval_var = ctk.StringVar(value="1") self.interval_entry = ctk.CTkEntry(parent, width=60, textvariable=self.interval_var) self.interval_entry.pack(side="left", padx=5, pady=5) def on_crawl(self): """单次抓取""" self.log("开始抓取排行榜数据...") self.set_running_state(True) def run(): try: service_module = import_with_crawler_context('entertainment', 'services.crawler_service') CrawlerService = service_module.CrawlerService size = int(self.size_var.get()) fetch_fans = self.fetch_fans_var.get() service = CrawlerService() filepath = service.crawl( rank_type=0, size=size, filter_type='anchor', fetch_fans=fetch_fans ) if filepath: self.log(f"抓取完成! 文件: {filepath}") else: self.log("抓取失败") except Exception as e: self.log(f"抓取出错: {e}") finally: self.after(0, lambda: self.set_running_state(False)) self.running_thread = threading.Thread(target=run, daemon=True) self.running_thread.start() def on_daemon(self): """守护进程模式""" self.log("启动守护进程模式...") self.set_running_state(True) def run(): try: import time service_module = import_with_crawler_context('entertainment', 'services.crawler_service') CrawlerService = service_module.CrawlerService size = int(self.size_var.get()) fetch_fans = self.fetch_fans_var.get() interval_hours = float(self.interval_var.get()) interval_seconds = interval_hours * 3600 self.log(f"守护进程已启动 - 间隔: {interval_hours} 小时") cycle = 0 while not self.stop_flag.is_set(): cycle += 1 self.log(f"第 {cycle} 次抓取开始...") try: service = CrawlerService() filepath = service.crawl( rank_type=0, size=size, filter_type='anchor', fetch_fans=fetch_fans ) if filepath: self.log(f"第 {cycle} 次抓取完成: {filepath.name}") else: self.log(f"第 {cycle} 次抓取失败") except Exception as e: self.log(f"第 {cycle} 次抓取出错: {e}") # 等待下次抓取 self.log(f"等待 {interval_hours} 小时后进行下次抓取...") # 分段等待 waited = 0 while waited < interval_seconds and not 
self.stop_flag.is_set(): time.sleep(min(30, interval_seconds - waited)) waited += 30 self.log("守护进程已停止") except Exception as e: self.log(f"守护进程出错: {e}") finally: self.after(0, lambda: self.set_running_state(False)) self.running_thread = threading.Thread(target=run, daemon=True) self.running_thread.start() class App(ctk.CTk): """主应用程序""" # 日志最大行数,超过后自动截断 MAX_LOG_LINES = 1000 def __init__(self): super().__init__() self.title("字节直播数据采集工具") self.geometry("900x650") self.minsize(800, 550) # 日志队列 self.log_queue = queue.Queue() # 重定向标准输出到日志 self.setup_stdout_redirect() self.setup_ui() self.process_log_queue() def setup_stdout_redirect(self): """重定向标准输出和错误输出到日志队列""" class StdoutRedirector: def __init__(self, log_queue, original_stdout): self.log_queue = log_queue self.original_stdout = original_stdout self.buffer = "" def write(self, message): # 同时输出到原始stdout(控制台) if self.original_stdout: try: self.original_stdout.write(message) except: pass # 处理消息,按行分割 if message: self.buffer += message while '\n' in self.buffer: line, self.buffer = self.buffer.split('\n', 1) if line.strip(): self.log_queue.put(f"{line}\n") def flush(self): if self.original_stdout: try: self.original_stdout.flush() except: pass # 刷新缓冲区中剩余的内容 if self.buffer.strip(): self.log_queue.put(f"{self.buffer}\n") self.buffer = "" # 保存原始的stdout和stderr self.original_stdout = sys.stdout self.original_stderr = sys.stderr # 重定向 sys.stdout = StdoutRedirector(self.log_queue, self.original_stdout) sys.stderr = StdoutRedirector(self.log_queue, self.original_stderr) def setup_ui(self): # 标题 title_label = ctk.CTkLabel( self, text="直播数据采集工具", font=ctk.CTkFont(size=20, weight="bold") ) title_label.pack(pady=10) # Tab 视图 self.tabview = ctk.CTkTabview(self) self.tabview.pack(fill="both", expand=True, padx=10, pady=5) # 添加 Tab self.tabview.add("巨量百应") self.tabview.add("字节联盟-主播排行") # 创建 Tab 内容 self.commerce_tab = CommerceTab( self.tabview.tab("巨量百应"), self.log_queue ) self.commerce_tab.pack(fill="both", expand=True) 
self.entertainment_tab = EntertainmentTab( self.tabview.tab("字节联盟-主播排行"), self.log_queue ) self.entertainment_tab.pack(fill="both", expand=True) # 设置默认选中的 Tab self.tabview.set("字节联盟-主播排行") # 日志区域 log_frame = ctk.CTkFrame(self) log_frame.pack(fill="both", expand=True, padx=10, pady=10) # 日志标题行 log_header = ctk.CTkFrame(log_frame, fg_color="transparent") log_header.pack(fill="x", padx=5, pady=(5, 0)) log_label = ctk.CTkLabel(log_header, text="运行日志:", anchor="w") log_label.pack(side="left") # 日志行数显示 self.log_count_label = ctk.CTkLabel(log_header, text="(0 行)", anchor="w", text_color="gray") self.log_count_label.pack(side="left", padx=5) self.log_text = ctk.CTkTextbox(log_frame, height=150) self.log_text.pack(fill="both", expand=True, padx=5, pady=5) # 按钮区域 btn_frame = ctk.CTkFrame(log_frame, fg_color="transparent") btn_frame.pack(fill="x", padx=5, pady=5) # 左侧按钮 clear_btn = ctk.CTkButton( btn_frame, text="清空日志", width=80, command=self.clear_log ) clear_btn.pack(side="left", padx=5) # 右侧按钮 exit_btn = ctk.CTkButton( btn_frame, text="退出程序", width=80, fg_color="red", hover_color="darkred", command=self.on_exit ) exit_btn.pack(side="right", padx=5) # 初始日志 self.log("程序启动完成,请选择爬虫类型并操作") def log(self, message): """写入日志""" timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") self.log_queue.put(f"[{timestamp}] {message}\n") def clear_log(self): """清空日志""" self.log_text.delete("1.0", "end") self.update_log_count() def update_log_count(self): """更新日志行数显示""" try: # 获取行数 content = self.log_text.get("1.0", "end-1c") line_count = content.count('\n') + 1 if content else 0 self.log_count_label.configure(text=f"({line_count} 行)") except: pass def truncate_log_if_needed(self): """如果日志超过最大行数,截断旧日志""" try: content = self.log_text.get("1.0", "end-1c") lines = content.split('\n') if len(lines) > self.MAX_LOG_LINES: # 保留最新的日志 keep_lines = lines[-self.MAX_LOG_LINES:] self.log_text.delete("1.0", "end") self.log_text.insert("1.0", '\n'.join(keep_lines)) # 添加截断提示 self.log_text.insert("1.0", 
f"[日志已截断,保留最新 {self.MAX_LOG_LINES} 行]\n") except: pass def process_log_queue(self): """处理日志队列""" message_count = 0 try: while True: message = self.log_queue.get_nowait() self.log_text.insert("end", message) message_count += 1 # 每处理50条消息检查一次是否需要截断 if message_count % 50 == 0: self.truncate_log_if_needed() except queue.Empty: pass if message_count > 0: self.log_text.see("end") self.update_log_count() # 处理完一批消息后检查截断 self.truncate_log_if_needed() # 每 100ms 检查一次 self.after(100, self.process_log_queue) def on_exit(self): """退出程序""" if messagebox.askyesno("确认", "确定要退出程序吗?"): # 恢复标准输出 sys.stdout = self.original_stdout sys.stderr = self.original_stderr self.destroy() def main(): app = App() app.mainloop() if __name__ == "__main__": main()