live-forum/server/crawler/entertainment/main.py
2026-03-24 11:27:37 +08:00

336 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
主程序入口
提供命令行接口
"""
import sys
import argparse
import time
import gc
from datetime import datetime, timedelta
from core.browser_login import BrowserLogin
from core.cookie_manager import CookieManager
from services.crawler_service import CrawlerService
from config.settings import (
DAEMON_INTERVAL_NO_FANS_HOURS,
DAEMON_INTERVAL_WITH_FANS_HOURS
)
from utils.logger import logger
def login_command():
    """Trigger the interactive browser login flow.

    Returns:
        int: 0 when login produced cookies, 1 on failure, user
        interruption (Ctrl+C), or any unexpected error.
    """
    try:
        session = BrowserLogin()
        obtained = session.trigger_login()
        if not obtained:
            print("\n❌ 登录流程失败")
            return 1
        print("\n✅ 登录流程完成!")
        return 0
    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断登录流程")
        logger.info("用户中断登录")
        return 1
    except Exception as e:
        logger.error(f"登录命令执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
def status_command():
    """Print the current cookie status (file path, count, metadata).

    Returns:
        int: always 0; the command is purely informational.
    """
    manager = CookieManager()
    divider = "=" * 60
    print("\n" + divider)
    print("📊 Cookie状态")
    print(divider)
    if not manager.is_cookie_exists():
        print("❌ Cookie文件不存在")
        print("💡 提示: 运行 'python main.py login' 进行登录")
    else:
        data = manager.load_cookies()
        if not data:
            print("❌ Cookie文件存在但加载失败")
        else:
            meta = data.get("metadata", {})
            cookie_list = data.get("cookies", [])
            print(f"✅ Cookie文件存在")
            print(f"📁 文件路径: {manager.cookie_file}")
            print(f"📦 Cookie数量: {len(cookie_list)}")
            print(f"👤 用户名: {meta.get('username', 'Unknown')}")
            print(f"🕐 创建时间: {meta.get('created_at', 'Unknown')}")
            print(f"🕐 最后验证: {meta.get('last_validated', 'Unknown')}")
            print(f"🔗 登录URL: {meta.get('url', 'Unknown')}")
    print(divider + "\n")
    return 0
def crawl_command(args):
    """Crawl ranking data once and save the result to a file.

    Args:
        args: parsed CLI namespace providing ``rank_type``, ``size``,
            ``filter_type`` and (optionally) ``fetch_fans``.

    Returns:
        int: 0 if the crawl produced a file path, 1 on failure,
        interruption, or error.
    """
    try:
        crawler_service = CrawlerService()
        # getattr with a default replaces the hasattr/ternary dance:
        # fetch fan data unless the namespace explicitly disables it.
        fetch_fans = getattr(args, 'fetch_fans', True)
        filepath = crawler_service.crawl(
            rank_type=args.rank_type,
            size=args.size,
            filter_type=args.filter_type,
            fetch_fans=fetch_fans
        )
        return 0 if filepath else 1
    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断抓取流程")
        logger.info("用户中断抓取")
        return 1
    except Exception as e:
        logger.error(f"抓取命令执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
def _warn_repeated_failures(consecutive_failures):
    """Print troubleshooting hints and log a warning after repeated crawl failures."""
    print(f"\n⚠️ 警告: 已连续失败 {consecutive_failures} 次!")
    print(" 可能的原因:")
    print(" 1. Cookie已失效需要重新登录")
    print(" 2. 网络连接问题")
    print(" 3. API服务异常")
    print(f"\n 建议:运行 'python main.py login' 检查并更新Cookie")
    print(" 程序将继续运行,等待下次抓取...\n")
    logger.warning(f"连续失败 {consecutive_failures}可能Cookie已失效")


def daemon_command(args):
    """Run in daemon (loop) mode, crawling on two independent schedules.

    One schedule crawls without fan data, the other with fan data; when
    the fan-inclusive interval has elapsed it takes priority and also
    satisfies the no-fan schedule.

    Args:
        args: parsed CLI namespace with ``rank_type``, ``size``,
            ``filter_type`` and optional ``interval_no_fans`` /
            ``interval_with_fans`` overrides (hours).

    Returns:
        int: 0 when stopped by the user (Ctrl+C), 1 on a fatal error.
    """
    try:
        # Fall back to configured defaults only when the flag was omitted;
        # "is not None" keeps an explicit 0 from being silently replaced.
        interval_no_fans = args.interval_no_fans if args.interval_no_fans is not None else DAEMON_INTERVAL_NO_FANS_HOURS
        interval_with_fans = args.interval_with_fans if args.interval_with_fans is not None else DAEMON_INTERVAL_WITH_FANS_HOURS
        # Convert hours to seconds.
        interval_no_fans_seconds = interval_no_fans * 3600
        interval_with_fans_seconds = interval_with_fans * 3600
        print("\n" + "="*60)
        print("🔄 启动循环运行模式(守护进程)")
        print(f"⏰ 不包含粉丝数据抓取间隔: {interval_no_fans} 小时")
        print(f"⏰ 包含粉丝数据抓取间隔: {interval_with_fans} 小时")
        print("="*60)
        print("💡 提示: 按 Ctrl+C 停止运行\n")
        logger.info(f"守护进程启动 - 不包含粉丝间隔: {interval_no_fans}小时, 包含粉丝间隔: {interval_with_fans}小时")
        # Backdate both timestamps so the first crawl runs immediately.
        last_no_fans_time = datetime.now() - timedelta(seconds=interval_no_fans_seconds)
        last_with_fans_time = datetime.now() - timedelta(seconds=interval_with_fans_seconds)
        cycle_count = 0
        consecutive_failures = 0
        max_consecutive_failures = 3  # warn once this many failures pile up
        while True:
            current_time = datetime.now()
            # Has either schedule's interval elapsed?
            time_since_no_fans = (current_time - last_no_fans_time).total_seconds()
            need_no_fans = time_since_no_fans >= interval_no_fans_seconds
            time_since_with_fans = (current_time - last_with_fans_time).total_seconds()
            need_with_fans = time_since_with_fans >= interval_with_fans_seconds
            if need_no_fans or need_with_fans:
                cycle_count += 1  # count only cycles that actually crawl
                # Prefer the fan-inclusive crawl when its interval has elapsed.
                fetch_fans = need_with_fans
                crawler_service = None
                try:
                    # Fresh service instance per cycle to avoid leaking state.
                    crawler_service = CrawlerService()
                    print(f"\n[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] 开始第 {cycle_count} 次抓取 ({'包含' if fetch_fans else '不包含'}粉丝数据)")
                    logger.info(f"开始第 {cycle_count} 次抓取 - 包含粉丝: {fetch_fans}")
                    filepath = crawler_service.crawl(
                        rank_type=args.rank_type,
                        size=args.size,
                        filter_type=args.filter_type,
                        fetch_fans=fetch_fans
                    )
                    if filepath:
                        print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ✅ 抓取完成: {filepath.name}")
                        logger.info(f"{cycle_count} 次抓取成功: {filepath.name}")
                        consecutive_failures = 0
                    else:
                        consecutive_failures += 1
                        print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ❌ 抓取失败 (连续失败 {consecutive_failures} 次)")
                        logger.error(f"{cycle_count} 次抓取失败 (连续失败 {consecutive_failures} 次)")
                        if consecutive_failures >= max_consecutive_failures:
                            _warn_repeated_failures(consecutive_failures)
                    # Advance the schedule whether the crawl succeeded or not,
                    # so a persistent failure does not retry in a tight loop.
                    # (Fix: the original updated nothing on a failed no-fans
                    # crawl, causing an immediate retry.)
                    if fetch_fans:
                        last_with_fans_time = current_time
                        last_no_fans_time = current_time  # a fan crawl also covers the no-fan schedule
                    else:
                        last_no_fans_time = current_time
                except Exception as e:
                    consecutive_failures += 1
                    error_msg = str(e)[:100]
                    print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ❌ 抓取异常: {error_msg} (连续失败 {consecutive_failures} 次)")
                    logger.error(f"{cycle_count} 次抓取异常: {e} (连续失败 {consecutive_failures} 次)", exc_info=True)
                    if consecutive_failures >= max_consecutive_failures:
                        _warn_repeated_failures(consecutive_failures)
                    # Advance the schedule on exceptions too, same reason.
                    if fetch_fans:
                        last_with_fans_time = current_time
                        last_no_fans_time = current_time
                    else:
                        last_no_fans_time = current_time
                finally:
                    # Release the service and force a GC pass to curb
                    # memory growth across long daemon runs.
                    if crawler_service is not None:
                        del crawler_service
                        gc.collect()
            # Sleep until the nearer of the two schedules is next due.
            check_time = datetime.now()
            next_check_seconds = min(
                max(0, interval_no_fans_seconds - (check_time - last_no_fans_time).total_seconds()),
                max(0, interval_with_fans_seconds - (check_time - last_with_fans_time).total_seconds())
            )
            if next_check_seconds > 0:
                next_time = check_time + timedelta(seconds=next_check_seconds)
                print(f"[{check_time.strftime('%Y-%m-%d %H:%M:%S')}] ⏳ 等待中... 下次抓取: {next_time.strftime('%Y-%m-%d %H:%M:%S')}", end='\r')
                # Sleep in slices so the countdown display can refresh
                # without flooding the console.
                wait_interval = 30
                waited = 0
                while waited < next_check_seconds:
                    sleep_time = min(wait_interval, next_check_seconds - waited)
                    time.sleep(sleep_time)
                    waited += sleep_time
                    if waited < next_check_seconds:
                        remaining = next_check_seconds - waited
                        next_time = datetime.now() + timedelta(seconds=remaining)
                        # Refresh roughly once a minute, more often in the last minute.
                        if remaining % 60 < 30 or remaining < 60:
                            print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] ⏳ 等待中... 下次抓取: {next_time.strftime('%Y-%m-%d %H:%M:%S')} ({int(remaining//60)}{int(remaining%60)}秒)", end='\r')
                print()  # newline after the \r progress line
    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断守护进程")
        logger.info("守护进程被用户中断")
        return 0
    except Exception as e:
        logger.error(f"守护进程执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
def main():
    """Parse command-line arguments and dispatch to the chosen sub-command.

    Returns:
        int: the sub-command's exit code, or 1 when no command was given
        or an unexpected error occurred.
    """
    parser = argparse.ArgumentParser(
        description='字节跳动直播服务平台数据抓取工具',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
python main.py login # 触发登录流程
python main.py status # 查看Cookie状态
"""
    )
    subparsers = parser.add_subparsers(dest='command', help='可用命令')
    # login / status take no extra options, so the returned parsers
    # need not be kept (the originals were unused locals).
    subparsers.add_parser('login', help='触发登录流程')
    subparsers.add_parser('status', help='查看Cookie状态')
    # crawl command
    crawl_parser = subparsers.add_parser('crawl', help='抓取排行榜数据')
    crawl_parser.add_argument('--rank-type', type=int, default=0, help='排行类型0=总榜')
    crawl_parser.add_argument('--size', type=int, default=10, help='每页数量')
    crawl_parser.add_argument('--filter-type', type=str, default='anchor', help='过滤类型')
    # Mutually exclusive fan-data switches writing to a single dest.
    fans_group = crawl_parser.add_mutually_exclusive_group()
    fans_group.add_argument('--fetch-fans', action='store_true', default=True, dest='fetch_fans', help='抓取粉丝数据(默认)')
    fans_group.add_argument('--no-fetch-fans', action='store_false', dest='fetch_fans', help='不抓取粉丝数据')
    # daemon command (looping mode)
    daemon_parser = subparsers.add_parser('daemon', help='循环运行模式(守护进程),定时自动抓取数据')
    daemon_parser.add_argument('--rank-type', type=int, default=0, help='排行类型0=总榜')
    daemon_parser.add_argument('--size', type=int, default=10, help='每页数量')
    daemon_parser.add_argument('--filter-type', type=str, default='anchor', help='过滤类型')
    daemon_parser.add_argument('--interval-no-fans', type=float, default=None,
                               help=f'不包含粉丝数据的抓取间隔(小时,默认: {DAEMON_INTERVAL_NO_FANS_HOURS}')
    daemon_parser.add_argument('--interval-with-fans', type=float, default=None,
                               help=f'包含粉丝数据的抓取间隔(小时,默认: {DAEMON_INTERVAL_WITH_FANS_HOURS}')
    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        return 1
    # Dispatch table keeps the command -> handler mapping in one place.
    handlers = {
        'login': lambda: login_command(),
        'status': lambda: status_command(),
        'crawl': lambda: crawl_command(args),
        'daemon': lambda: daemon_command(args),
    }
    try:
        handler = handlers.get(args.command)
        if handler is None:
            parser.print_help()
            return 1
        return handler()
    except Exception as e:
        logger.error(f"程序执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
# Script entry point: propagate the sub-command's exit code to the shell.
if __name__ == '__main__':
    sys.exit(main())