live-forum/server/crawler/gui/app.py
2026-03-24 11:27:37 +08:00

1092 lines
38 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
字节直播数据采集工具 - GUI 版本
整合 commerce 和 entertainment 两个爬虫
"""
import sys
import os
import threading
import queue
import json
import hashlib
import requests
from datetime import datetime
from pathlib import Path
# Resolve base directories for both frozen (PyInstaller) and source runs.
if getattr(sys, 'frozen', False):
    # Frozen build: everything is unpacked under sys._MEIPASS.
    BASE_DIR = Path(sys._MEIPASS)
    GUI_DIR = CRAWLER_DIR = BASE_DIR
    # Point Playwright at the bundled browsers when they ship with the exe.
    playwright_path = BASE_DIR / 'ms-playwright'
    if playwright_path.exists():
        os.environ['PLAYWRIGHT_BROWSERS_PATH'] = str(playwright_path)
else:
    # Development run: layout is <crawler>/gui/app.py.
    GUI_DIR = Path(__file__).parent
    CRAWLER_DIR = GUI_DIR.parent
# Make the crawler project importable.
sys.path.insert(0, str(CRAWLER_DIR))
import customtkinter as ctk
from tkinter import messagebox
import importlib
def import_with_crawler_context(crawler_type: str, module_path: str):
    """Import *module_path* as if running inside the given crawler package.

    The commerce and entertainment crawlers both ship top-level packages
    named config/core/utils/services.  To load the right one, any cached
    copies are evicted from sys.modules and the crawler's directory is
    moved to the very front of sys.path before importing.

    Args:
        crawler_type: 'commerce' or 'entertainment'.
        module_path: dotted module path to import, e.g. 'core.browser_login'.

    Returns:
        The imported module object.
    """
    crawler_path = str(CRAWLER_DIR / crawler_type)
    # Evict cached modules whose top-level name would shadow this crawler's.
    shadowed = ('config', 'core', 'utils', 'services')
    stale = [
        name for name in list(sys.modules)
        if name.split('.', 1)[0] in shadowed
    ]
    for name in stale:
        del sys.modules[name]
    # Push the crawler directory to the front of sys.path.  It is left in
    # place afterwards on purpose: the imported module may lazily import
    # siblings while it runs.
    try:
        sys.path.remove(crawler_path)
    except ValueError:
        pass
    sys.path.insert(0, crawler_path)
    return importlib.import_module(module_path)
# Global customtkinter appearance settings.
ctk.set_appearance_mode("dark")
ctk.set_default_color_theme("blue")
# Platform configuration (kept consistent with the browser extension).
# Each entry maps a crawler code to its display name and login URL.
PLATFORMS = {
    'commerce': {
        'name': '巨量百应-达人账号',
        'loginUrl': 'https://buyin.jinritemai.com/mpa/account/institution-role-select'
    },
    'entertainment': {
        'name': '字节联盟-主播排行',
        'loginUrl': 'https://union.bytedance.com/open/portal/data/leaderboard?appId=3000'
    }
}
class GlobalConfigManager:
    """Manage the global upload-server configuration (server URL + API key).

    Settings are persisted as a UTF-8 JSON file next to the executable in
    frozen builds, or next to this source file in development runs.
    """
    # NOTE(review): a live-looking API key is hard-coded here — consider
    # moving it out of source control.
    DEFAULT_CONFIG = {
        'serverUrl': 'https://admin.api.skzhijia.com',
        'apiKey': 'sk_7A353DEF2BFD4EADA4BF364CCC7C8FDE2AAD1'
    }

    def __init__(self):
        # The config file lives in the gui directory (or beside the exe).
        if getattr(sys, 'frozen', False):
            self.config_file = Path(sys.executable).parent / 'upload_config.json'
        else:
            self.config_file = GUI_DIR / 'upload_config.json'

    def load_config(self) -> dict:
        """Return the saved configuration merged over ``DEFAULT_CONFIG``.

        Falls back to a fresh copy of the defaults when the file is
        missing, unreadable, or not valid JSON.
        """
        if self.config_file.exists():
            try:
                with open(self.config_file, 'r', encoding='utf-8') as f:
                    config = json.load(f)
                # Saved values override the defaults.
                return {**self.DEFAULT_CONFIG, **config}
            except (OSError, ValueError, TypeError):
                # Unreadable, corrupt, or non-mapping content: best-effort
                # fallback to defaults (deliberately silent).
                pass
        return self.DEFAULT_CONFIG.copy()

    def save_config(self, config: dict) -> bool:
        """Write *config* to disk as pretty-printed UTF-8 JSON.

        Returns:
            True on success, False if the file could not be written or
            the payload is not JSON-serializable.
        """
        try:
            with open(self.config_file, 'w', encoding='utf-8') as f:
                json.dump(config, f, ensure_ascii=False, indent=2)
            return True
        except (OSError, TypeError, ValueError):
            return False
# Module-wide singleton used by the dialogs and tabs below.
global_config_manager = GlobalConfigManager()
class ConfigDialog(ctk.CTkToplevel):
    """Modal dialog for editing the upload settings and crawler settings."""

    def __init__(self, parent, crawler_type, on_save_callback=None):
        super().__init__(parent)
        self.crawler_type = crawler_type
        self.on_save_callback = on_save_callback
        self.title(f"配置 - {crawler_type}")
        self.geometry("500x480")
        self.resizable(False, False)
        # Make the dialog modal over its parent.
        self.transient(parent)
        self.grab_set()
        self.load_config()
        self.setup_ui()
        # Center the dialog on screen.
        self.update_idletasks()
        x = (self.winfo_screenwidth() - self.winfo_width()) // 2
        y = (self.winfo_screenheight() - self.winfo_height()) // 2
        self.geometry(f"+{x}+{y}")

    def load_config(self):
        """Read the crawler's config.settings values and the global upload config."""
        try:
            config_module = import_with_crawler_context(self.crawler_type, 'config.settings')
            self.config_values = {
                'REPORT_URL': getattr(config_module, 'REPORT_URL', ''),
                'REQUEST_TIMEOUT': getattr(config_module, 'REQUEST_TIMEOUT', 30),
                'BROWSER_TIMEOUT': getattr(config_module, 'BROWSER_TIMEOUT', 300),
                'FANS_FETCH_CONCURRENCY': getattr(config_module, 'FANS_FETCH_CONCURRENCY', 5),
            }
            # The two crawlers expose different daemon-interval settings.
            if self.crawler_type == 'commerce':
                self.config_values['DAEMON_INTERVAL_HOURS'] = getattr(config_module, 'DAEMON_INTERVAL_HOURS', 1)
            else:
                self.config_values['DAEMON_INTERVAL_NO_FANS_HOURS'] = getattr(config_module, 'DAEMON_INTERVAL_NO_FANS_HOURS', 1)
                self.config_values['DAEMON_INTERVAL_WITH_FANS_HOURS'] = getattr(config_module, 'DAEMON_INTERVAL_WITH_FANS_HOURS', 6)
        except Exception as e:
            self.config_values = {}
            messagebox.showerror("错误", f"加载配置失败: {e}")
        # Load the global upload configuration.
        self.global_config = global_config_manager.load_config()

    def setup_ui(self):
        """Build the scrollable settings form and the Save/Cancel buttons."""
        # Scrollable container for the form.
        scroll_frame = ctk.CTkScrollableFrame(self)
        scroll_frame.pack(fill="both", expand=True, padx=10, pady=10)
        self.entries = {}
        self.global_entries = {}
        # Cookie upload settings (global configuration).
        ctk.CTkLabel(scroll_frame, text="Cookie 上传配置", font=ctk.CTkFont(weight="bold")).pack(anchor="w", pady=(0, 5))
        # Server address.
        self._add_global_entry(scroll_frame, "服务器地址:", 'serverUrl', width=400)
        # API key (masked input).
        self._add_global_entry(scroll_frame, "API Key:", 'apiKey', width=400, show="*")
        # General crawler settings.
        ctk.CTkLabel(scroll_frame, text="爬虫配置", font=ctk.CTkFont(weight="bold")).pack(anchor="w", pady=(15, 5))
        # Data report URL.
        self._add_entry(scroll_frame, "数据上报URL:", 'REPORT_URL', width=400)
        # Request timeout.
        self._add_entry(scroll_frame, "请求超时(秒):", 'REQUEST_TIMEOUT', width=100)
        # Browser login timeout.
        self._add_entry(scroll_frame, "登录超时(秒):", 'BROWSER_TIMEOUT', width=100)
        # Fan-fetch concurrency.
        self._add_entry(scroll_frame, "粉丝抓取并发数:", 'FANS_FETCH_CONCURRENCY', width=100)
        # Daemon interval settings.
        ctk.CTkLabel(scroll_frame, text="守护进程配置", font=ctk.CTkFont(weight="bold")).pack(anchor="w", pady=(15, 5))
        if self.crawler_type == 'commerce':
            self._add_entry(scroll_frame, "抓取间隔(小时):", 'DAEMON_INTERVAL_HOURS', width=100)
        else:
            self._add_entry(scroll_frame, "无粉丝抓取间隔(小时):", 'DAEMON_INTERVAL_NO_FANS_HOURS', width=100)
            self._add_entry(scroll_frame, "含粉丝抓取间隔(小时):", 'DAEMON_INTERVAL_WITH_FANS_HOURS', width=100)
        # Button row.
        btn_frame = ctk.CTkFrame(self)
        btn_frame.pack(fill="x", padx=10, pady=10)
        ctk.CTkButton(btn_frame, text="保存", width=80, command=self.on_save).pack(side="right", padx=5)
        ctk.CTkButton(btn_frame, text="取消", width=80, command=self.destroy).pack(side="right", padx=5)

    def _add_entry(self, parent, label, key, width=200):
        """Add one labelled crawler-setting entry and register it in self.entries."""
        frame = ctk.CTkFrame(parent, fg_color="transparent")
        frame.pack(fill="x", pady=3)
        ctk.CTkLabel(frame, text=label, width=150, anchor="w").pack(side="left")
        entry = ctk.CTkEntry(frame, width=width)
        entry.pack(side="left", padx=5)
        value = self.config_values.get(key, '')
        entry.insert(0, str(value) if value else '')
        self.entries[key] = entry

    def _add_global_entry(self, parent, label, key, width=200, show=None):
        """Add one labelled global-setting entry and register it in self.global_entries."""
        frame = ctk.CTkFrame(parent, fg_color="transparent")
        frame.pack(fill="x", pady=3)
        ctk.CTkLabel(frame, text=label, width=150, anchor="w").pack(side="left")
        entry = ctk.CTkEntry(frame, width=width, show=show)
        entry.pack(side="left", padx=5)
        value = self.global_config.get(key, '')
        entry.insert(0, str(value) if value else '')
        self.global_entries[key] = entry

    def on_save(self):
        """Persist the global upload settings, then close the dialog."""
        # NOTE(review): the crawler-setting entries (self.entries) are shown
        # and edited but their values are never written back anywhere here —
        # confirm whether that is intentional.
        # Save the global upload configuration.
        new_global_config = {}
        for key, entry in self.global_entries.items():
            new_global_config[key] = entry.get().strip()
        if global_config_manager.save_config(new_global_config):
            messagebox.showinfo("提示", "配置已保存。\n注意:部分爬虫配置需要重启程序才能生效。")
        else:
            messagebox.showerror("错误", "保存配置失败")
            return
        if self.on_save_callback:
            self.on_save_callback()
        self.destroy()
class LogHandler:
    """File-like adapter that forwards written text into a GUI log queue."""

    def __init__(self, text_widget, log_queue):
        self.text_widget = text_widget
        self.log_queue = log_queue

    def write(self, message):
        # Ignore whitespace-only writes; queue everything else verbatim.
        if not message.strip():
            return
        self.log_queue.put(message)

    def flush(self):
        # Nothing is buffered locally; present only to satisfy the
        # file-like protocol.
        pass
class CrawlerTab(ctk.CTkFrame):
    """Base class for one crawler tab.

    Provides the cookie status row, the action buttons (login, logout,
    upload cookie, status, single crawl, daemon, stop, config) and the
    shared threading/state machinery.  Subclasses implement
    ``setup_params``, ``on_crawl`` and ``on_daemon``.
    """

    def __init__(self, parent, crawler_type, log_queue):
        super().__init__(parent)
        self.crawler_type = crawler_type
        self.log_queue = log_queue
        # Worker thread for the currently running background task, if any.
        self.running_thread = None
        # Cooperative stop signal polled by daemon loops.
        self.stop_flag = threading.Event()
        self.is_logged_in = False
        self.is_running = False  # whether a background task is currently running
        self.setup_ui()
        self.check_cookie_status()

    def setup_ui(self):
        """Build the status row, the button row and the parameter area."""
        # Cookie status area.
        status_frame = ctk.CTkFrame(self)
        status_frame.pack(fill="x", padx=10, pady=10)
        self.cookie_label = ctk.CTkLabel(
            status_frame,
            text="Cookie 状态: 检查中...",
            font=ctk.CTkFont(size=14)
        )
        self.cookie_label.pack(side="left", padx=10, pady=5)
        # Button area.
        btn_frame = ctk.CTkFrame(self)
        btn_frame.pack(fill="x", padx=10, pady=5)
        self.login_btn = ctk.CTkButton(
            btn_frame, text="登录", width=80,
            command=self.on_login
        )
        self.login_btn.pack(side="left", padx=5, pady=5)
        self.logout_btn = ctk.CTkButton(
            btn_frame, text="退出登录", width=80,
            fg_color="orange", hover_color="darkorange",
            command=self.on_logout,
            state="disabled"
        )
        self.logout_btn.pack(side="left", padx=5, pady=5)
        self.upload_btn = ctk.CTkButton(
            btn_frame, text="上传Cookie", width=90,
            fg_color="#28a745", hover_color="#218838",
            command=self.on_upload_cookie,
            state="disabled"
        )
        self.upload_btn.pack(side="left", padx=5, pady=5)
        self.status_btn = ctk.CTkButton(
            btn_frame, text="查看状态", width=80,
            command=self.on_status,
            state="disabled"
        )
        self.status_btn.pack(side="left", padx=5, pady=5)
        self.crawl_btn = ctk.CTkButton(
            btn_frame, text="单次抓取", width=80,
            command=self.on_crawl,
            state="disabled"
        )
        self.crawl_btn.pack(side="left", padx=5, pady=5)
        self.daemon_btn = ctk.CTkButton(
            btn_frame, text="启动守护", width=80,
            command=self.on_daemon,
            state="disabled"
        )
        self.daemon_btn.pack(side="left", padx=5, pady=5)
        self.stop_btn = ctk.CTkButton(
            btn_frame, text="停止", width=80,
            fg_color="red", hover_color="darkred",
            command=self.on_stop,
            state="disabled"
        )
        self.stop_btn.pack(side="left", padx=5, pady=5)
        self.config_btn = ctk.CTkButton(
            btn_frame, text="配置", width=80,
            fg_color="gray", hover_color="darkgray",
            command=self.on_config
        )
        self.config_btn.pack(side="left", padx=5, pady=5)
        # Parameter area, populated by the subclass.
        params_frame = ctk.CTkFrame(self)
        params_frame.pack(fill="x", padx=10, pady=5)
        self.setup_params(params_frame)

    def setup_params(self, parent):
        """Hook for subclasses to add their parameter widgets."""
        pass

    def log(self, message):
        """Queue a timestamped, crawler-tagged log line."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        self.log_queue.put(f"[{timestamp}] [{self.crawler_type}] {message}\n")

    def check_cookie_status(self):
        """Inspect the saved cookie file and refresh the label and buttons."""
        try:
            cookie_module = import_with_crawler_context(self.crawler_type, 'core.cookie_manager')
            CookieManager = cookie_module.CookieManager
            cookie_manager = CookieManager()
            if cookie_manager.is_cookie_exists():
                cookie_data = cookie_manager.load_cookies()
                if cookie_data:
                    metadata = cookie_data.get("metadata", {})
                    username = metadata.get("username", "未知")
                    self.cookie_label.configure(
                        text=f"Cookie 状态: 已登录 ({username})",
                        text_color="green"
                    )
                    self.is_logged_in = True
                    self.update_buttons_state()
                    return
            # No cookie file, or it failed to load: treat as logged out.
            self.cookie_label.configure(
                text="Cookie 状态: 未登录",
                text_color="red"
            )
            self.is_logged_in = False
            self.update_buttons_state()
        except Exception as e:
            self.cookie_label.configure(
                text=f"Cookie 状态: 检查失败 ({e})",
                text_color="orange"
            )
            self.is_logged_in = False
            self.update_buttons_state()

    def update_buttons_state(self):
        """Enable/disable buttons from the login/running state machine."""
        if self.is_running:
            # A task is running: only the stop button stays enabled.
            self.login_btn.configure(state="disabled")
            self.logout_btn.configure(state="disabled")
            self.upload_btn.configure(state="disabled")
            self.status_btn.configure(state="disabled")
            self.crawl_btn.configure(state="disabled")
            self.daemon_btn.configure(state="disabled")
            self.stop_btn.configure(state="normal")
            self.config_btn.configure(state="disabled")
        elif self.is_logged_in:
            # Logged in and idle: everything except login is available.
            self.login_btn.configure(state="disabled")
            self.logout_btn.configure(state="normal")
            self.upload_btn.configure(state="normal")
            self.status_btn.configure(state="normal")
            self.crawl_btn.configure(state="normal")
            self.daemon_btn.configure(state="normal")
            self.stop_btn.configure(state="disabled")
            self.config_btn.configure(state="normal")
        else:
            # Not logged in: only login (and config) are available.
            self.login_btn.configure(state="normal")
            self.logout_btn.configure(state="disabled")
            self.upload_btn.configure(state="disabled")
            self.status_btn.configure(state="disabled")
            self.crawl_btn.configure(state="disabled")
            self.daemon_btn.configure(state="disabled")
            self.stop_btn.configure(state="disabled")
            self.config_btn.configure(state="normal")

    def on_login(self):
        """Launch the browser login flow on a worker thread."""
        self.log("正在启动浏览器登录...")
        self.is_running = True
        self.update_buttons_state()

        def run():
            try:
                browser_module = import_with_crawler_context(self.crawler_type, 'core.browser_login')
                BrowserLogin = browser_module.BrowserLogin
                browser_login = BrowserLogin()
                cookies = browser_login.trigger_login()
                if cookies:
                    self.log("登录成功!")
                else:
                    self.log("登录失败或被取消")
            except Exception as e:
                self.log(f"登录出错: {e}")
            finally:
                # Hop back to the Tk main thread before touching widgets.
                self.after(0, self._on_login_complete)
        self.running_thread = threading.Thread(target=run, daemon=True)
        self.running_thread.start()

    def _on_login_complete(self):
        """Main-thread follow-up after the login worker finishes."""
        self.is_running = False
        self.check_cookie_status()

    def on_logout(self):
        """Delete the saved cookie file after user confirmation."""
        if messagebox.askyesno("确认", "确定要退出登录吗?\n这将清除保存的Cookie。"):
            try:
                cookie_module = import_with_crawler_context(self.crawler_type, 'core.cookie_manager')
                CookieManager = cookie_module.CookieManager
                cookie_manager = CookieManager()
                cookie_file = cookie_manager.cookie_file
                if cookie_file.exists():
                    cookie_file.unlink()
                    self.log("已退出登录Cookie 已清除")
                else:
                    self.log("Cookie 文件不存在")
                self.check_cookie_status()
            except Exception as e:
                self.log(f"退出登录失败: {e}")

    def on_config(self):
        """Open the settings dialog for this crawler."""
        ConfigDialog(self.winfo_toplevel(), self.crawler_type)

    def on_status(self):
        """Log a summary of the saved cookie file's metadata."""
        try:
            cookie_module = import_with_crawler_context(self.crawler_type, 'core.cookie_manager')
            CookieManager = cookie_module.CookieManager
            cookie_manager = CookieManager()
            if cookie_manager.is_cookie_exists():
                cookie_data = cookie_manager.load_cookies()
                if cookie_data:
                    metadata = cookie_data.get("metadata", {})
                    cookies = cookie_data.get("cookies", [])
                    info = f"""Cookie 状态信息:
文件路径: {cookie_manager.cookie_file}
Cookie 数量: {len(cookies)}
用户名: {metadata.get('username', '未知')}
创建时间: {metadata.get('created_at', '未知')}
最后验证: {metadata.get('last_validated', '未知')}
登录URL: {metadata.get('url', '未知')}"""
                    self.log(info)
                else:
                    self.log("Cookie 文件存在但加载失败")
            else:
                self.log("Cookie 文件不存在,请先登录")
            self.check_cookie_status()
        except Exception as e:
            self.log(f"查看状态出错: {e}")

    def on_upload_cookie(self):
        """Upload the saved cookies to the configured server on a worker thread."""
        # Validate the upload configuration first.
        config = global_config_manager.load_config()
        server_url = config.get('serverUrl', '').strip()
        api_key = config.get('apiKey', '').strip()
        if not server_url or not api_key:
            messagebox.showwarning("配置缺失", "请先在配置中设置服务器地址和 API Key")
            return
        self.log("正在上传 Cookie 到服务器...")
        self.set_running_state(True)

        def run():
            try:
                # Load the cookie data from disk.
                cookie_module = import_with_crawler_context(self.crawler_type, 'core.cookie_manager')
                CookieManager = cookie_module.CookieManager
                cookie_manager = CookieManager()
                if not cookie_manager.is_cookie_exists():
                    self.log("Cookie 文件不存在,请先登录")
                    return
                cookie_data = cookie_manager.load_cookies()
                if not cookie_data:
                    self.log("加载 Cookie 失败")
                    return
                cookies = cookie_data.get("cookies", [])
                if not cookies:
                    self.log("Cookie 为空")
                    return
                # Serialize the cookie list as "name=value; name2=value2".
                cookie_string = '; '.join([f"{c['name']}={c['value']}" for c in cookies])
                # MD5 checksum of the serialized string (integrity check).
                md5_hash = hashlib.md5(cookie_string.encode('utf-8')).hexdigest()
                # Look up the platform's login URL for the payload.
                platform = PLATFORMS.get(self.crawler_type, {})
                login_url = platform.get('loginUrl', '')
                # POST everything to the server.
                upload_url = f"{server_url}/api/crawler/cookie/upload"
                payload = {
                    'code': self.crawler_type,
                    'cookieString': cookie_string,
                    'md5': md5_hash,
                    'url': login_url
                }
                headers = {
                    'X-API-Key': api_key,
                    'Content-Type': 'application/json'
                }
                self.log(f"正在上传到: {upload_url}")
                response = requests.post(upload_url, json=payload, headers=headers, timeout=30)
                if response.status_code != 200:
                    self.log(f"上传失败: HTTP {response.status_code} - {response.text}")
                    return
                result = response.json()
                # The API signals success via a JSON body code, not just HTTP 200.
                if result.get('code') == 200:
                    self.log(f"Cookie 上传成功!")
                    if result.get('data'):
                        self.log(f"服务器返回: {result.get('data')}")
                else:
                    self.log(f"上传失败: {result.get('msg', '未知错误')}")
            except requests.exceptions.Timeout:
                self.log("上传超时,请检查网络连接")
            except requests.exceptions.ConnectionError:
                self.log("连接服务器失败,请检查服务器地址")
            except Exception as e:
                self.log(f"上传出错: {e}")
            finally:
                self.after(0, lambda: self.set_running_state(False))
        self.running_thread = threading.Thread(target=run, daemon=True)
        self.running_thread.start()

    def on_crawl(self):
        """Single crawl — implemented by subclasses."""
        pass

    def on_daemon(self):
        """Daemon mode — implemented by subclasses."""
        pass

    def on_stop(self):
        """Signal the running task to stop and start polling for completion."""
        self.stop_flag.set()
        self.log("正在停止...")
        # Poll shortly afterwards to see whether the worker has exited.
        self.after(1000, self._check_task_stopped)

    def _check_task_stopped(self):
        """Poll until the worker thread exits, then restore the idle UI state."""
        if self.running_thread and self.running_thread.is_alive():
            # Still running — keep polling.
            self.after(1000, self._check_task_stopped)
        else:
            # Worker has exited.
            self.is_running = False
            self.update_buttons_state()
            self.log("任务已停止")

    def set_running_state(self, running):
        """Set the running flag (clearing the stop signal on start) and refresh buttons."""
        self.is_running = running
        if running:
            self.stop_flag.clear()
        self.update_buttons_state()
class CommerceTab(CrawlerTab):
    """Tab for the commerce crawler (巨量百应)."""

    def __init__(self, parent, log_queue):
        super().__init__(parent, "commerce", log_queue)

    def setup_params(self, parent):
        """Add the page-size and daemon-interval parameter widgets."""
        # Page size.
        ctk.CTkLabel(parent, text="每页数量:").pack(side="left", padx=(10, 5), pady=5)
        self.page_size_var = ctk.StringVar(value="20")
        self.page_size_entry = ctk.CTkEntry(parent, width=60, textvariable=self.page_size_var)
        self.page_size_entry.pack(side="left", padx=5, pady=5)
        # Crawl interval for daemon mode.
        ctk.CTkLabel(parent, text="抓取间隔(小时):").pack(side="left", padx=(20, 5), pady=5)
        self.interval_var = ctk.StringVar(value="1")
        self.interval_entry = ctk.CTkEntry(parent, width=60, textvariable=self.interval_var)
        self.interval_entry.pack(side="left", padx=5, pady=5)

    def on_crawl(self):
        """Run a single crawl of the influencer-account data on a worker thread."""
        self.log("开始抓取达人账号数据...")
        self.set_running_state(True)

        def run():
            try:
                service_module = import_with_crawler_context('commerce', 'services.daren_account_service')
                DarenAccountService = service_module.DarenAccountService
                page_size = int(self.page_size_var.get())
                service = DarenAccountService()
                filepath = service.crawl(page_size=page_size, status=1)
                if filepath:
                    self.log(f"抓取完成! 文件: {filepath}")
                else:
                    self.log("抓取失败")
            except Exception as e:
                self.log(f"抓取出错: {e}")
            finally:
                self.after(0, lambda: self.set_running_state(False))
        self.running_thread = threading.Thread(target=run, daemon=True)
        self.running_thread.start()

    def on_daemon(self):
        """Crawl repeatedly at the configured interval until stopped."""
        self.log("启动守护进程模式...")
        self.set_running_state(True)

        def run():
            try:
                import time
                service_module = import_with_crawler_context('commerce', 'services.daren_account_service')
                DarenAccountService = service_module.DarenAccountService
                page_size = int(self.page_size_var.get())
                interval_hours = float(self.interval_var.get())
                interval_seconds = interval_hours * 3600
                self.log(f"守护进程已启动 - 间隔: {interval_hours} 小时")
                cycle = 0
                while not self.stop_flag.is_set():
                    cycle += 1
                    self.log(f"第 {cycle} 次抓取开始...")
                    try:
                        service = DarenAccountService()
                        filepath = service.crawl(page_size=page_size, status=1)
                        if filepath:
                            self.log(f"第 {cycle} 次抓取完成: {filepath.name}")
                        else:
                            self.log(f"第 {cycle} 次抓取失败")
                    except Exception as e:
                        self.log(f"第 {cycle} 次抓取出错: {e}")
                    # Wait for the next cycle.
                    self.log(f"等待 {interval_hours} 小时后进行下次抓取...")
                    # Sleep in short slices so the stop flag is honored promptly.
                    waited = 0
                    while waited < interval_seconds and not self.stop_flag.is_set():
                        time.sleep(min(30, interval_seconds - waited))
                        waited += 30
                self.log("守护进程已停止")
            except Exception as e:
                self.log(f"守护进程出错: {e}")
            finally:
                self.after(0, lambda: self.set_running_state(False))
        self.running_thread = threading.Thread(target=run, daemon=True)
        self.running_thread.start()
class EntertainmentTab(CrawlerTab):
    """Tab for the entertainment crawler (字节联盟-主播排行)."""

    def __init__(self, parent, log_queue):
        super().__init__(parent, "entertainment", log_queue)

    def setup_params(self, parent):
        """Add page-size, fetch-fans and daemon-interval parameter widgets."""
        # Page size.
        ctk.CTkLabel(parent, text="每页数量:").pack(side="left", padx=(10, 5), pady=5)
        self.size_var = ctk.StringVar(value="10")
        self.size_entry = ctk.CTkEntry(parent, width=60, textvariable=self.size_var)
        self.size_entry.pack(side="left", padx=5, pady=5)
        # Whether to also fetch fan data.
        self.fetch_fans_var = ctk.BooleanVar(value=True)
        self.fetch_fans_cb = ctk.CTkCheckBox(
            parent, text="抓取粉丝数据",
            variable=self.fetch_fans_var
        )
        self.fetch_fans_cb.pack(side="left", padx=20, pady=5)
        # Crawl interval for daemon mode.
        ctk.CTkLabel(parent, text="抓取间隔(小时):").pack(side="left", padx=(20, 5), pady=5)
        self.interval_var = ctk.StringVar(value="1")
        self.interval_entry = ctk.CTkEntry(parent, width=60, textvariable=self.interval_var)
        self.interval_entry.pack(side="left", padx=5, pady=5)

    def on_crawl(self):
        """Run a single crawl of the leaderboard on a worker thread."""
        self.log("开始抓取排行榜数据...")
        self.set_running_state(True)

        def run():
            try:
                service_module = import_with_crawler_context('entertainment', 'services.crawler_service')
                CrawlerService = service_module.CrawlerService
                size = int(self.size_var.get())
                fetch_fans = self.fetch_fans_var.get()
                service = CrawlerService()
                filepath = service.crawl(
                    rank_type=0,
                    size=size,
                    filter_type='anchor',
                    fetch_fans=fetch_fans
                )
                if filepath:
                    self.log(f"抓取完成! 文件: {filepath}")
                else:
                    self.log("抓取失败")
            except Exception as e:
                self.log(f"抓取出错: {e}")
            finally:
                self.after(0, lambda: self.set_running_state(False))
        self.running_thread = threading.Thread(target=run, daemon=True)
        self.running_thread.start()

    def on_daemon(self):
        """Crawl repeatedly at the configured interval until stopped."""
        self.log("启动守护进程模式...")
        self.set_running_state(True)

        def run():
            try:
                import time
                service_module = import_with_crawler_context('entertainment', 'services.crawler_service')
                CrawlerService = service_module.CrawlerService
                size = int(self.size_var.get())
                fetch_fans = self.fetch_fans_var.get()
                interval_hours = float(self.interval_var.get())
                interval_seconds = interval_hours * 3600
                self.log(f"守护进程已启动 - 间隔: {interval_hours} 小时")
                cycle = 0
                while not self.stop_flag.is_set():
                    cycle += 1
                    self.log(f"第 {cycle} 次抓取开始...")
                    try:
                        service = CrawlerService()
                        filepath = service.crawl(
                            rank_type=0,
                            size=size,
                            filter_type='anchor',
                            fetch_fans=fetch_fans
                        )
                        if filepath:
                            self.log(f"第 {cycle} 次抓取完成: {filepath.name}")
                        else:
                            self.log(f"第 {cycle} 次抓取失败")
                    except Exception as e:
                        self.log(f"第 {cycle} 次抓取出错: {e}")
                    # Wait for the next cycle.
                    self.log(f"等待 {interval_hours} 小时后进行下次抓取...")
                    # Sleep in short slices so the stop flag is honored promptly.
                    waited = 0
                    while waited < interval_seconds and not self.stop_flag.is_set():
                        time.sleep(min(30, interval_seconds - waited))
                        waited += 30
                self.log("守护进程已停止")
            except Exception as e:
                self.log(f"守护进程出错: {e}")
            finally:
                self.after(0, lambda: self.set_running_state(False))
        self.running_thread = threading.Thread(target=run, daemon=True)
        self.running_thread.start()
class App(ctk.CTk):
    """Main application window: tabs for the two crawlers plus a shared log view."""
    # Maximum number of log lines kept in the textbox; older lines are truncated.
    MAX_LOG_LINES = 1000

    def __init__(self):
        super().__init__()
        self.title("字节直播数据采集工具")
        self.geometry("900x650")
        self.minsize(800, 550)
        # Thread-safe queue feeding the log textbox.
        self.log_queue = queue.Queue()
        # Redirect stdout/stderr into the log queue.
        self.setup_stdout_redirect()
        self.setup_ui()
        self.process_log_queue()

    def setup_stdout_redirect(self):
        """Redirect stdout and stderr into the log queue (echoing to the console)."""
        class StdoutRedirector:
            def __init__(self, log_queue, original_stdout):
                self.log_queue = log_queue
                self.original_stdout = original_stdout
                self.buffer = ""

            def write(self, message):
                # Echo to the original console stream as well.
                if self.original_stdout:
                    try:
                        self.original_stdout.write(message)
                    except:  # NOTE(review): bare except — narrows poorly, but kept as-is
                        pass
                # Accumulate and emit only complete lines.
                if message:
                    self.buffer += message
                    while '\n' in self.buffer:
                        line, self.buffer = self.buffer.split('\n', 1)
                        if line.strip():
                            self.log_queue.put(f"{line}\n")

            def flush(self):
                if self.original_stdout:
                    try:
                        self.original_stdout.flush()
                    except:  # NOTE(review): bare except — kept as-is
                        pass
                # Emit any partial line still held in the buffer.
                if self.buffer.strip():
                    self.log_queue.put(f"{self.buffer}\n")
                    self.buffer = ""
        # Keep the original streams so they can be restored on exit.
        self.original_stdout = sys.stdout
        self.original_stderr = sys.stderr
        # Install the redirectors.
        sys.stdout = StdoutRedirector(self.log_queue, self.original_stdout)
        sys.stderr = StdoutRedirector(self.log_queue, self.original_stderr)

    def setup_ui(self):
        """Build the title, tab view, log area and bottom buttons."""
        # Title.
        title_label = ctk.CTkLabel(
            self,
            text="直播数据采集工具",
            font=ctk.CTkFont(size=20, weight="bold")
        )
        title_label.pack(pady=10)
        # Tab view.
        self.tabview = ctk.CTkTabview(self)
        self.tabview.pack(fill="both", expand=True, padx=10, pady=5)
        # Add the tabs.
        self.tabview.add("巨量百应")
        self.tabview.add("字节联盟-主播排行")
        # Create tab contents.
        self.commerce_tab = CommerceTab(
            self.tabview.tab("巨量百应"),
            self.log_queue
        )
        self.commerce_tab.pack(fill="both", expand=True)
        self.entertainment_tab = EntertainmentTab(
            self.tabview.tab("字节联盟-主播排行"),
            self.log_queue
        )
        self.entertainment_tab.pack(fill="both", expand=True)
        # Select the default tab.
        self.tabview.set("字节联盟-主播排行")
        # Log area.
        log_frame = ctk.CTkFrame(self)
        log_frame.pack(fill="both", expand=True, padx=10, pady=10)
        # Log header row.
        log_header = ctk.CTkFrame(log_frame, fg_color="transparent")
        log_header.pack(fill="x", padx=5, pady=(5, 0))
        log_label = ctk.CTkLabel(log_header, text="运行日志:", anchor="w")
        log_label.pack(side="left")
        # Line-count indicator.
        self.log_count_label = ctk.CTkLabel(log_header, text="(0 行)", anchor="w", text_color="gray")
        self.log_count_label.pack(side="left", padx=5)
        self.log_text = ctk.CTkTextbox(log_frame, height=150)
        self.log_text.pack(fill="both", expand=True, padx=5, pady=5)
        # Button row.
        btn_frame = ctk.CTkFrame(log_frame, fg_color="transparent")
        btn_frame.pack(fill="x", padx=5, pady=5)
        # Left-side button.
        clear_btn = ctk.CTkButton(
            btn_frame, text="清空日志", width=80,
            command=self.clear_log
        )
        clear_btn.pack(side="left", padx=5)
        # Right-side button.
        exit_btn = ctk.CTkButton(
            btn_frame, text="退出程序", width=80,
            fg_color="red", hover_color="darkred",
            command=self.on_exit
        )
        exit_btn.pack(side="right", padx=5)
        # Initial log line.
        self.log("程序启动完成,请选择爬虫类型并操作")

    def log(self, message):
        """Queue a timestamped log line."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        self.log_queue.put(f"[{timestamp}] {message}\n")

    def clear_log(self):
        """Clear the log textbox and reset the line counter."""
        self.log_text.delete("1.0", "end")
        self.update_log_count()

    def update_log_count(self):
        """Refresh the line-count label next to the log title."""
        try:
            # Count lines in the current textbox content.
            content = self.log_text.get("1.0", "end-1c")
            line_count = content.count('\n') + 1 if content else 0
            self.log_count_label.configure(text=f"({line_count} 行)")
        except:  # NOTE(review): bare except — kept as-is (UI best-effort)
            pass

    def truncate_log_if_needed(self):
        """Drop the oldest lines once the log exceeds MAX_LOG_LINES."""
        try:
            content = self.log_text.get("1.0", "end-1c")
            lines = content.split('\n')
            if len(lines) > self.MAX_LOG_LINES:
                # Keep only the newest lines.
                keep_lines = lines[-self.MAX_LOG_LINES:]
                self.log_text.delete("1.0", "end")
                self.log_text.insert("1.0", '\n'.join(keep_lines))
                # Prepend a truncation notice.
                self.log_text.insert("1.0", f"[日志已截断,保留最新 {self.MAX_LOG_LINES} 行]\n")
        except:  # NOTE(review): bare except — kept as-is (UI best-effort)
            pass

    def process_log_queue(self):
        """Drain the log queue into the textbox; reschedules itself every 100 ms."""
        message_count = 0
        try:
            while True:
                message = self.log_queue.get_nowait()
                self.log_text.insert("end", message)
                message_count += 1
                # Check for truncation every 50 messages within a burst.
                if message_count % 50 == 0:
                    self.truncate_log_if_needed()
        except queue.Empty:
            pass
        if message_count > 0:
            self.log_text.see("end")
            self.update_log_count()
            # Final truncation check after the batch.
            self.truncate_log_if_needed()
        # Poll again in 100 ms.
        self.after(100, self.process_log_queue)

    def on_exit(self):
        """Confirm, restore stdout/stderr, and close the window."""
        if messagebox.askyesno("确认", "确定要退出程序吗?"):
            # Restore the original streams before tearing down the GUI.
            sys.stdout = self.original_stdout
            sys.stderr = self.original_stderr
            self.destroy()
def main():
    """Entry point: construct the main window and run the Tk event loop."""
    App().mainloop()


if __name__ == "__main__":
    main()