""" 主程序入口 提供命令行接口 """ import sys import argparse import time import gc from datetime import datetime, timedelta from core.browser_login import BrowserLogin from core.cookie_manager import CookieManager from services.crawler_service import CrawlerService from config.settings import ( DAEMON_INTERVAL_NO_FANS_HOURS, DAEMON_INTERVAL_WITH_FANS_HOURS ) from utils.logger import logger def login_command(): """触发登录流程""" try: browser_login = BrowserLogin() cookies = browser_login.trigger_login() if cookies: print("\n✅ 登录流程完成!") return 0 else: print("\n❌ 登录流程失败") return 1 except KeyboardInterrupt: print("\n\n⚠️ 用户中断登录流程") logger.info("用户中断登录") return 1 except Exception as e: logger.error(f"登录命令执行失败: {e}", exc_info=True) print(f"\n❌ 发生错误: {e}") return 1 def status_command(): """查看Cookie状态""" cookie_manager = CookieManager() print("\n" + "="*60) print("📊 Cookie状态") print("="*60) if cookie_manager.is_cookie_exists(): cookie_data = cookie_manager.load_cookies() if cookie_data: metadata = cookie_data.get("metadata", {}) cookies = cookie_data.get("cookies", []) print(f"✅ Cookie文件存在") print(f"📁 文件路径: {cookie_manager.cookie_file}") print(f"📦 Cookie数量: {len(cookies)}") print(f"👤 用户名: {metadata.get('username', 'Unknown')}") print(f"🕐 创建时间: {metadata.get('created_at', 'Unknown')}") print(f"🕐 最后验证: {metadata.get('last_validated', 'Unknown')}") print(f"🔗 登录URL: {metadata.get('url', 'Unknown')}") else: print("❌ Cookie文件存在但加载失败") else: print("❌ Cookie文件不存在") print("💡 提示: 运行 'python main.py login' 进行登录") print("="*60 + "\n") return 0 def crawl_command(args): """抓取排行榜数据""" try: crawler_service = CrawlerService() # 确定是否抓取粉丝数据 fetch_fans = args.fetch_fans if hasattr(args, 'fetch_fans') else True filepath = crawler_service.crawl( rank_type=args.rank_type, size=args.size, filter_type=args.filter_type, fetch_fans=fetch_fans ) if filepath: return 0 else: return 1 except KeyboardInterrupt: print("\n\n⚠️ 用户中断抓取流程") logger.info("用户中断抓取") return 1 except Exception as e: logger.error(f"抓取命令执行失败: {e}", exc_info=True) print(f"\n❌ 发生错误: {e}") return 1 def daemon_command(args): """循环运行模式(守护进程)""" try: # 解析配置 interval_no_fans = args.interval_no_fans if args.interval_no_fans else DAEMON_INTERVAL_NO_FANS_HOURS interval_with_fans = args.interval_with_fans if args.interval_with_fans else DAEMON_INTERVAL_WITH_FANS_HOURS # 转换为秒 interval_no_fans_seconds = interval_no_fans * 3600 interval_with_fans_seconds = interval_with_fans * 3600 print("\n" + "="*60) print("🔄 启动循环运行模式(守护进程)") print(f"⏰ 不包含粉丝数据抓取间隔: {interval_no_fans} 小时") print(f"⏰ 包含粉丝数据抓取间隔: {interval_with_fans} 小时") print("="*60) print("💡 提示: 按 Ctrl+C 停止运行\n") logger.info(f"守护进程启动 - 不包含粉丝间隔: {interval_no_fans}小时, 包含粉丝间隔: {interval_with_fans}小时") # 初始化时间追踪 last_no_fans_time = datetime.now() - timedelta(seconds=interval_no_fans_seconds) # 立即执行第一次 last_with_fans_time = datetime.now() - timedelta(seconds=interval_with_fans_seconds) # 立即执行第一次 cycle_count = 0 consecutive_failures = 0 # 连续失败次数 max_consecutive_failures = 3 # 最大连续失败次数,超过后给出警告 while True: current_time = datetime.now() # 检查是否需要抓取不包含粉丝的数据 time_since_no_fans = (current_time - last_no_fans_time).total_seconds() need_no_fans = time_since_no_fans >= interval_no_fans_seconds # 检查是否需要抓取包含粉丝的数据 time_since_with_fans = (current_time - last_with_fans_time).total_seconds() need_with_fans = time_since_with_fans >= interval_with_fans_seconds if need_no_fans or need_with_fans: cycle_count += 1 # 只在执行抓取时增加计数 # 确定本次抓取是否包含粉丝 fetch_fans = need_with_fans # 如果到了包含粉丝的时间,优先抓取包含粉丝的数据 crawler_service = None try: # 创建新的服务实例,避免内存泄漏 crawler_service = CrawlerService() # 使用静默模式,减少控制台输出 print(f"\n[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] 开始第 {cycle_count} 次抓取 ({'包含' if fetch_fans else '不包含'}粉丝数据)") logger.info(f"开始第 {cycle_count} 次抓取 - 包含粉丝: {fetch_fans}") filepath = crawler_service.crawl( rank_type=args.rank_type, size=args.size, filter_type=args.filter_type, fetch_fans=fetch_fans ) if filepath: print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ✅ 抓取完成: {filepath.name}") logger.info(f"第 {cycle_count} 次抓取成功: {filepath.name}") # 重置连续失败计数 consecutive_failures = 0 # 更新最后执行时间 if fetch_fans: last_with_fans_time = current_time last_no_fans_time = current_time # 包含粉丝的抓取也满足不包含粉丝的需求 else: last_no_fans_time = current_time else: consecutive_failures += 1 print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ❌ 抓取失败 (连续失败 {consecutive_failures} 次)") logger.error(f"第 {cycle_count} 次抓取失败 (连续失败 {consecutive_failures} 次)") # 如果连续失败次数过多,给出警告 if consecutive_failures >= max_consecutive_failures: print(f"\n⚠️ 警告: 已连续失败 {consecutive_failures} 次!") print(" 可能的原因:") print(" 1. Cookie已失效,需要重新登录") print(" 2. 网络连接问题") print(" 3. API服务异常") print(f"\n 建议:运行 'python main.py login' 检查并更新Cookie") print(" 程序将继续运行,等待下次抓取...\n") logger.warning(f"连续失败 {consecutive_failures} 次,可能Cookie已失效") # 即使失败也更新时间,避免重复失败 if fetch_fans: last_with_fans_time = current_time last_no_fans_time = current_time except Exception as e: consecutive_failures += 1 error_msg = str(e)[:100] print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ❌ 抓取异常: {error_msg} (连续失败 {consecutive_failures} 次)") logger.error(f"第 {cycle_count} 次抓取异常: {e} (连续失败 {consecutive_failures} 次)", exc_info=True) # 如果连续失败次数过多,给出警告 if consecutive_failures >= max_consecutive_failures: print(f"\n⚠️ 警告: 已连续失败 {consecutive_failures} 次!") print(" 可能的原因:") print(" 1. Cookie已失效,需要重新登录") print(" 2. 网络连接问题") print(" 3. API服务异常") print(f"\n 建议:运行 'python main.py login' 检查并更新Cookie") print(" 程序将继续运行,等待下次抓取...\n") logger.warning(f"连续失败 {consecutive_failures} 次,可能Cookie已失效") # 即使异常也更新时间,避免重复异常 if fetch_fans: last_with_fans_time = current_time last_no_fans_time = current_time finally: # 清理资源,防止内存泄漏 if crawler_service is not None: del crawler_service gc.collect() # 计算下次检查时间(取两个间隔中较小的) check_time = datetime.now() next_check_seconds = min( max(0, interval_no_fans_seconds - (check_time - last_no_fans_time).total_seconds()), max(0, interval_with_fans_seconds - (check_time - last_with_fans_time).total_seconds()) ) # 如果两个都到了,立即执行,否则等待 if next_check_seconds > 0: # 显示下次执行时间(只在开始时显示一次,减少输出) next_time = check_time + timedelta(seconds=next_check_seconds) print(f"[{check_time.strftime('%Y-%m-%d %H:%M:%S')}] ⏳ 等待中... 下次抓取: {next_time.strftime('%Y-%m-%d %H:%M:%S')}", end='\r') # 分段等待,每30秒更新一次显示,避免控制台输出过多 wait_interval = 30 # 每30秒更新一次显示 waited = 0 while waited < next_check_seconds: sleep_time = min(wait_interval, next_check_seconds - waited) time.sleep(sleep_time) waited += sleep_time # 每30秒更新一次显示,或者剩余时间少于30秒时更新 if waited < next_check_seconds: remaining = next_check_seconds - waited next_time = datetime.now() + timedelta(seconds=remaining) # 只在剩余时间变化较大时更新显示,减少刷新频率 if remaining % 60 < 30 or remaining < 60: # 每分钟更新一次,或剩余时间少于1分钟时频繁更新 print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] ⏳ 等待中... 下次抓取: {next_time.strftime('%Y-%m-%d %H:%M:%S')} ({int(remaining//60)}分{int(remaining%60)}秒)", end='\r') print() # 换行 except KeyboardInterrupt: print("\n\n⚠️ 用户中断守护进程") logger.info("守护进程被用户中断") return 0 except Exception as e: logger.error(f"守护进程执行失败: {e}", exc_info=True) print(f"\n❌ 发生错误: {e}") return 1 def main(): """主函数""" parser = argparse.ArgumentParser( description='字节跳动直播服务平台数据抓取工具', formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" 示例: python main.py login # 触发登录流程 python main.py status # 查看Cookie状态 """ ) subparsers = parser.add_subparsers(dest='command', help='可用命令') # login命令 login_parser = subparsers.add_parser('login', help='触发登录流程') # status命令 status_parser = subparsers.add_parser('status', help='查看Cookie状态') # crawl命令 crawl_parser = subparsers.add_parser('crawl', help='抓取排行榜数据') crawl_parser.add_argument('--rank-type', type=int, default=0, help='排行类型,0=总榜') crawl_parser.add_argument('--size', type=int, default=10, help='每页数量') crawl_parser.add_argument('--filter-type', type=str, default='anchor', help='过滤类型') # 粉丝数据抓取选项(使用互斥组) fans_group = crawl_parser.add_mutually_exclusive_group() fans_group.add_argument('--fetch-fans', action='store_true', default=True, dest='fetch_fans', help='抓取粉丝数据(默认)') fans_group.add_argument('--no-fetch-fans', action='store_false', dest='fetch_fans', help='不抓取粉丝数据') # daemon命令(循环运行模式) daemon_parser = subparsers.add_parser('daemon', help='循环运行模式(守护进程),定时自动抓取数据') daemon_parser.add_argument('--rank-type', type=int, default=0, help='排行类型,0=总榜') daemon_parser.add_argument('--size', type=int, default=10, help='每页数量') daemon_parser.add_argument('--filter-type', type=str, default='anchor', help='过滤类型') daemon_parser.add_argument('--interval-no-fans', type=float, default=None, help=f'不包含粉丝数据的抓取间隔(小时,默认: {DAEMON_INTERVAL_NO_FANS_HOURS})') daemon_parser.add_argument('--interval-with-fans', type=float, default=None, help=f'包含粉丝数据的抓取间隔(小时,默认: {DAEMON_INTERVAL_WITH_FANS_HOURS})') args = parser.parse_args() if not args.command: parser.print_help() return 1 try: if args.command == 'login': return login_command() elif args.command == 'status': return status_command() elif args.command == 'crawl': return crawl_command(args) elif args.command == 'daemon': return daemon_command(args) else: parser.print_help() return 1 except Exception as e: logger.error(f"程序执行失败: {e}", exc_info=True) print(f"\n❌ 发生错误: {e}") return 1 if __name__ == '__main__': sys.exit(main())