"""Program entry point providing the command-line interface.

Sub-commands:
    login        trigger the browser login flow
    status       show the state of the stored cookies
    crawl        fetch ranking data
    fetch-daren  fetch the bound influencer ("daren") account data once
    daemon       fetch the influencer account data repeatedly on a schedule

Every ``*_command`` function returns a process exit code (0 on success,
1 on failure) which ``main`` hands to ``sys.exit``.
"""
import sys
import argparse
import time
import gc
from datetime import datetime, timedelta

from core.browser_login import BrowserLogin
from core.cookie_manager import CookieManager
from services.crawler_service import CrawlerService
from services.daren_account_service import DarenAccountService
from config.settings import DAEMON_INTERVAL_HOURS
from utils.logger import logger

# Timestamp layout used for every console status line.
_TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
# The waiting status line is refreshed in steps of this many seconds.
_WAIT_STEP_SECONDS = 30
# Number of consecutive failed cycles after which the daemon prints a warning.
_MAX_CONSECUTIVE_FAILURES = 3


def _timestamp():
    """Return the current local time formatted for console status lines."""
    return datetime.now().strftime(_TIME_FORMAT)


def login_command():
    """Run the login flow.

    Returns:
        0 if ``BrowserLogin.trigger_login`` returned a truthy value,
        1 if it returned a falsy value, was interrupted with Ctrl+C, or raised.
    """
    try:
        cookies = BrowserLogin().trigger_login()
        if cookies:
            print("\n✅ 登录流程完成!")
            return 0
        print("\n❌ 登录流程失败")
        return 1
    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断登录流程")
        logger.info("用户中断登录")
        return 1
    except Exception as e:
        logger.error(f"登录命令执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1


def status_command():
    """Print a summary of the stored cookie file.

    Returns:
        Always 0; a missing or unreadable cookie file is reported on the
        console but is not treated as a command failure.
    """
    cookie_manager = CookieManager()

    print("\n" + "="*60)
    print("📊 Cookie状态")
    print("="*60)

    if cookie_manager.is_cookie_exists():
        cookie_data = cookie_manager.load_cookies()
        if cookie_data:
            metadata = cookie_data.get("metadata", {})
            cookies = cookie_data.get("cookies", [])
            print("✅ Cookie文件存在")
            print(f"📁 文件路径: {cookie_manager.cookie_file}")
            print(f"📦 Cookie数量: {len(cookies)}")
            print(f"👤 用户名: {metadata.get('username', 'Unknown')}")
            print(f"🕐 创建时间: {metadata.get('created_at', 'Unknown')}")
            print(f"🕐 最后验证: {metadata.get('last_validated', 'Unknown')}")
            print(f"🔗 登录URL: {metadata.get('url', 'Unknown')}")
        else:
            print("❌ Cookie文件存在但加载失败")
    else:
        print("❌ Cookie文件不存在")
        print("💡 提示: 运行 'python main.py login' 进行登录")

    print("="*60 + "\n")
    return 0


def crawl_command(args):
    """Fetch ranking data.

    Args:
        args: Parsed arguments providing ``rank_type``, ``size`` and
            ``filter_type``; ``fetch_fans`` is optional and defaults to True.

    Returns:
        0 if ``CrawlerService.crawl`` returned a truthy value, otherwise 1
        (including Ctrl+C and any raised exception).
    """
    try:
        filepath = CrawlerService().crawl(
            rank_type=args.rank_type,
            size=args.size,
            filter_type=args.filter_type,
            fetch_fans=getattr(args, 'fetch_fans', True),
        )
        return 0 if filepath else 1
    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断抓取流程")
        logger.info("用户中断抓取")
        return 1
    except Exception as e:
        logger.error(f"抓取命令执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1


def fetch_daren_command(args):
    """Fetch the influencer account data once.

    Args:
        args: Parsed arguments; ``page_size`` (default 20) and ``status``
            (default 1) are read when present.

    Returns:
        0 if ``DarenAccountService.crawl`` returned a truthy value,
        otherwise 1 (including Ctrl+C and any raised exception).
    """
    try:
        filepath = DarenAccountService().crawl(
            page_size=getattr(args, 'page_size', 20),
            status=getattr(args, 'status', 1),
        )
        if filepath:
            print("\n✅ 数据抓取成功!")
            print(f"📁 文件路径: {filepath}")
            return 0
        print("\n❌ 数据抓取失败")
        return 1
    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断抓取")
        logger.info("用户中断抓取")
        return 1
    except Exception as e:
        logger.error(f"抓取命令执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1


def _warn_consecutive_failures(failure_count, third_cause):
    """Print and log the warning shown after repeated failed crawl cycles.

    Args:
        failure_count: Number of consecutive failed cycles so far.
        third_cause: Text of the third "possible cause" bullet, which differs
            between a falsy crawl result and a raised exception.
    """
    print(f"\n⚠️ 警告: 已连续失败 {failure_count} 次!")
    print(" 可能的原因:")
    print(" 1. Cookie已失效,需要重新登录")
    print(" 2. 网络连接问题")
    print(f" 3. {third_cause}")
    print("\n 建议:运行 'python main.py login' 检查并更新Cookie")
    print(" 程序将继续运行,等待下次抓取...\n")
    logger.warning(f"连续失败 {failure_count} 次,可能Cookie已失效")


def _wait_for_next_crawl(wait_seconds):
    """Sleep for ``wait_seconds`` while refreshing a one-line countdown.

    The status line ends with a carriage return so that each refresh
    overwrites the previous one. It is flushed explicitly because a line
    without a newline is otherwise held back by line-buffered stdout.
    """
    if wait_seconds <= 0:
        return

    started_at = datetime.now()
    next_time = started_at + timedelta(seconds=wait_seconds)
    print(
        f"[{started_at.strftime(_TIME_FORMAT)}] ⏳ 等待中... "
        f"下次抓取: {next_time.strftime(_TIME_FORMAT)}",
        end='\r',
        flush=True,
    )

    waited = 0
    while waited < wait_seconds:
        # Sleep in short steps so the countdown can be refreshed in between.
        step = min(_WAIT_STEP_SECONDS, wait_seconds - waited)
        time.sleep(step)
        waited += step

        if waited >= wait_seconds:
            break

        remaining = wait_seconds - waited
        # Refresh roughly once per minute, and on every step in the last minute.
        if remaining % 60 < 30 or remaining < 60:
            next_time = datetime.now() + timedelta(seconds=remaining)
            hours = int(remaining // 3600)
            minutes = int((remaining % 3600) // 60)
            seconds = int(remaining % 60)
            if hours > 0:
                time_str = f"{hours}小时{minutes}分{seconds}秒"
            else:
                time_str = f"{minutes}分{seconds}秒"
            print(
                f"[{_timestamp()}] ⏳ 等待中... "
                f"下次抓取: {next_time.strftime(_TIME_FORMAT)} (剩余 {time_str})",
                end='\r',
                flush=True,
            )

    print()  # terminate the carriage-return status line


def daemon_command(args):
    """Run in loop mode, fetching the influencer account data on a schedule.

    The first crawl runs immediately; afterwards one crawl is started every
    ``interval`` hours, measured from the start of the previous cycle. A
    failed or raising cycle does not stop the loop.

    Args:
        args: Parsed arguments; ``interval`` (hours, falls back to
            ``DAEMON_INTERVAL_HOURS`` when missing, None or 0), ``page_size``
            (default 20) and ``status`` (default 1) are read when present.

    Returns:
        0 when stopped with Ctrl+C, 1 when the interval is not positive or an
        unexpected error escapes the loop.
    """
    try:
        interval_hours = getattr(args, 'interval', None) or DAEMON_INTERVAL_HOURS
        if interval_hours <= 0:
            # A non-positive interval would make every loop iteration crawl
            # again with no wait in between, hammering the remote API.
            print(f"\n❌ 抓取间隔必须大于0小时: {interval_hours}")
            logger.error(f"守护进程启动失败 - 无效的抓取间隔: {interval_hours}")
            return 1

        interval_seconds = interval_hours * 3600
        page_size = getattr(args, 'page_size', 20)
        status = getattr(args, 'status', 1)

        print("\n" + "="*60)
        print("🔄 启动循环运行模式(守护进程)")
        print("📊 数据类型: 达人账号数据")
        print(f"⏰ 抓取间隔: {interval_hours} 小时")
        print(f"📄 每页数量: {page_size}")
        print("="*60)
        print("💡 提示: 按 Ctrl+C 停止运行\n")

        logger.info(f"守护进程启动 - 抓取间隔: {interval_hours}小时, 每页数量: {page_size}")

        # Back-date the last run by one interval so the first crawl is immediate.
        last_crawl_time = datetime.now() - timedelta(seconds=interval_seconds)
        cycle_count = 0
        consecutive_failures = 0

        while True:
            current_time = datetime.now()
            elapsed = (current_time - last_crawl_time).total_seconds()

            if elapsed >= interval_seconds:
                cycle_count += 1
                service = None
                try:
                    # Use a fresh service instance per cycle so nothing
                    # accumulates across cycles.
                    service = DarenAccountService()

                    print(f"\n[{current_time.strftime(_TIME_FORMAT)}] 🚀 开始第 {cycle_count} 次抓取")
                    logger.info(f"开始第 {cycle_count} 次抓取达人账号数据")

                    filepath = service.crawl(page_size=page_size, status=status)

                    if filepath:
                        print(f"[{_timestamp()}] ✅ 抓取完成: {filepath.name}")
                        logger.info(f"第 {cycle_count} 次抓取成功: {filepath.name}")
                        consecutive_failures = 0
                    else:
                        consecutive_failures += 1
                        print(f"[{_timestamp()}] ❌ 抓取失败 (连续失败 {consecutive_failures} 次)")
                        logger.error(f"第 {cycle_count} 次抓取失败 (连续失败 {consecutive_failures} 次)")
                        if consecutive_failures >= _MAX_CONSECUTIVE_FAILURES:
                            _warn_consecutive_failures(consecutive_failures, "API服务异常")
                except Exception as e:
                    consecutive_failures += 1
                    error_msg = str(e)[:100]
                    print(f"[{_timestamp()}] ❌ 抓取异常: {error_msg} (连续失败 {consecutive_failures} 次)")
                    logger.error(
                        f"第 {cycle_count} 次抓取异常: {e} (连续失败 {consecutive_failures} 次)",
                        exc_info=True,
                    )
                    if consecutive_failures >= _MAX_CONSECUTIVE_FAILURES:
                        _warn_consecutive_failures(consecutive_failures, "程序异常")
                finally:
                    # Drop the service and force a collection between cycles.
                    service = None
                    gc.collect()

                # Advance the schedule even after a failure, so a broken cycle
                # is retried at the next interval instead of immediately.
                last_crawl_time = current_time

            since_last = (datetime.now() - last_crawl_time).total_seconds()
            _wait_for_next_crawl(max(0, interval_seconds - since_last))

    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断守护进程")
        logger.info("守护进程被用户中断")
        return 0
    except Exception as e:
        logger.error(f"守护进程执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1


def _build_parser():
    """Create the argument parser with all sub-commands registered."""
    parser = argparse.ArgumentParser(
        description='巨量百应 - MCN机构数据抓取工具',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
  python main.py login                        # 触发登录流程
  python main.py status                       # 查看Cookie状态
  python main.py fetch-daren                  # 抓取达人账号数据(单次)
  python main.py fetch-daren --page-size 50   # 自定义每页数量
  python main.py daemon                       # 守护进程模式(默认1小时抓取一次)
  python main.py daemon --interval 2          # 自定义抓取间隔(2小时)
"""
    )

    subparsers = parser.add_subparsers(dest='command', help='可用命令')

    # login / status take no options
    subparsers.add_parser('login', help='触发登录流程')
    subparsers.add_parser('status', help='查看Cookie状态')

    # crawl: ranking data
    crawl_parser = subparsers.add_parser('crawl', help='抓取排行榜数据')
    crawl_parser.add_argument('--rank-type', type=int, default=0, help='排行类型,0=总榜')
    crawl_parser.add_argument('--size', type=int, default=10, help='每页数量')
    crawl_parser.add_argument('--filter-type', type=str, default='anchor', help='过滤类型')

    # Fan-data switches share one destination and are mutually exclusive.
    fans_group = crawl_parser.add_mutually_exclusive_group()
    fans_group.add_argument('--fetch-fans', action='store_true', default=True,
                            dest='fetch_fans', help='抓取粉丝数据(默认)')
    fans_group.add_argument('--no-fetch-fans', action='store_false',
                            dest='fetch_fans', help='不抓取粉丝数据')

    # fetch-daren: single fetch of the influencer account data
    fetch_daren_parser = subparsers.add_parser('fetch-daren', help='抓取MCN机构绑定的达人账号数据')
    fetch_daren_parser.add_argument('--page-size', type=int, default=20, help='每页数量(默认20)')
    fetch_daren_parser.add_argument('--status', type=int, default=1, help='状态(1=全部,默认1)')

    # daemon: loop mode
    daemon_parser = subparsers.add_parser('daemon', help='循环运行模式(守护进程),定时自动抓取达人账号数据')
    daemon_parser.add_argument('--interval', type=float, default=None,
                               help=f'抓取间隔(小时,默认: {DAEMON_INTERVAL_HOURS})')
    daemon_parser.add_argument('--page-size', type=int, default=20, help='每页数量(默认20)')
    daemon_parser.add_argument('--status', type=int, default=1, help='状态(默认1=全部)')

    return parser


def main():
    """Parse the command line and dispatch to the selected sub-command.

    Returns:
        The exit code of the executed command; 1 when no command was given
        (the help text is printed) or an unexpected error occurred.
    """
    parser = _build_parser()
    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return 1

    try:
        if args.command == 'login':
            return login_command()
        elif args.command == 'status':
            return status_command()
        elif args.command == 'crawl':
            return crawl_command(args)
        elif args.command == 'fetch-daren':
            return fetch_daren_command(args)
        elif args.command == 'daemon':
            return daemon_command(args)
        else:
            parser.print_help()
            return 1
    except Exception as e:
        logger.error(f"程序执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1


if __name__ == '__main__':
    sys.exit(main())