live-forum/server/crawler/entertainment/main.py
2026-03-24 11:27:37 +08:00

336 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
主程序入口
提供命令行接口
"""
import sys
import argparse
import time
import gc
from datetime import datetime, timedelta
from core.browser_login import BrowserLogin
from core.cookie_manager import CookieManager
from services.crawler_service import CrawlerService
from config.settings import (
DAEMON_INTERVAL_NO_FANS_HOURS,
DAEMON_INTERVAL_WITH_FANS_HOURS
)
from utils.logger import logger
def login_command():
    """Trigger the interactive browser login flow.

    Returns:
        int: 0 when login produced cookies, 1 on failure, user
        interruption (Ctrl+C), or any unexpected error.
    """
    try:
        session = BrowserLogin()
        obtained = session.trigger_login()
        if not obtained:
            print("\n❌ 登录流程失败")
            return 1
        print("\n✅ 登录流程完成!")
        return 0
    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断登录流程")
        logger.info("用户中断登录")
        return 1
    except Exception as e:
        logger.error(f"登录命令执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
def status_command():
    """Print the current cookie status (file path, count, metadata).

    Returns:
        int: always 0; the command is purely informational.
    """
    manager = CookieManager()
    divider = "=" * 60
    print("\n" + divider)
    print("📊 Cookie状态")
    print(divider)
    if not manager.is_cookie_exists():
        print("❌ Cookie文件不存在")
        print("💡 提示: 运行 'python main.py login' 进行登录")
    else:
        data = manager.load_cookies()
        if not data:
            print("❌ Cookie文件存在但加载失败")
        else:
            meta = data.get("metadata", {})
            cookie_list = data.get("cookies", [])
            print(f"✅ Cookie文件存在")
            print(f"📁 文件路径: {manager.cookie_file}")
            print(f"📦 Cookie数量: {len(cookie_list)}")
            print(f"👤 用户名: {meta.get('username', 'Unknown')}")
            print(f"🕐 创建时间: {meta.get('created_at', 'Unknown')}")
            print(f"🕐 最后验证: {meta.get('last_validated', 'Unknown')}")
            print(f"🔗 登录URL: {meta.get('url', 'Unknown')}")
    print(divider + "\n")
    return 0
def crawl_command(args):
    """Crawl ranking data once and save the result to a file.

    Args:
        args: parsed CLI namespace providing ``rank_type``, ``size``,
            ``filter_type`` and (optionally) ``fetch_fans``.

    Returns:
        int: 0 if the crawl produced a file path, 1 on failure,
        interruption, or error.
    """
    try:
        crawler_service = CrawlerService()
        # getattr with a default replaces the hasattr/ternary dance:
        # fetch fan data unless the namespace explicitly disables it.
        fetch_fans = getattr(args, 'fetch_fans', True)
        filepath = crawler_service.crawl(
            rank_type=args.rank_type,
            size=args.size,
            filter_type=args.filter_type,
            fetch_fans=fetch_fans
        )
        return 0 if filepath else 1
    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断抓取流程")
        logger.info("用户中断抓取")
        return 1
    except Exception as e:
        logger.error(f"抓取命令执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
def _warn_repeated_failures(consecutive_failures):
    """Print troubleshooting hints and log a warning after repeated crawl failures."""
    print(f"\n⚠️ 警告: 已连续失败 {consecutive_failures} 次!")
    print(" 可能的原因:")
    print(" 1. Cookie已失效需要重新登录")
    print(" 2. 网络连接问题")
    print(" 3. API服务异常")
    print(f"\n 建议:运行 'python main.py login' 检查并更新Cookie")
    print(" 程序将继续运行,等待下次抓取...\n")
    logger.warning(f"连续失败 {consecutive_failures}可能Cookie已失效")


def daemon_command(args):
    """Run in daemon (loop) mode, crawling on two independent schedules.

    One schedule crawls without fan data, the other with fan data; when
    the fan-inclusive interval has elapsed it takes priority and also
    satisfies the no-fan schedule.

    Args:
        args: parsed CLI namespace with ``rank_type``, ``size``,
            ``filter_type`` and optional ``interval_no_fans`` /
            ``interval_with_fans`` overrides (hours).

    Returns:
        int: 0 when stopped by the user (Ctrl+C), 1 on a fatal error.
    """
    try:
        # Fall back to configured defaults only when the flag was omitted;
        # "is not None" keeps an explicit 0 from being silently replaced.
        interval_no_fans = args.interval_no_fans if args.interval_no_fans is not None else DAEMON_INTERVAL_NO_FANS_HOURS
        interval_with_fans = args.interval_with_fans if args.interval_with_fans is not None else DAEMON_INTERVAL_WITH_FANS_HOURS
        # Convert hours to seconds.
        interval_no_fans_seconds = interval_no_fans * 3600
        interval_with_fans_seconds = interval_with_fans * 3600
        print("\n" + "="*60)
        print("🔄 启动循环运行模式(守护进程)")
        print(f"⏰ 不包含粉丝数据抓取间隔: {interval_no_fans} 小时")
        print(f"⏰ 包含粉丝数据抓取间隔: {interval_with_fans} 小时")
        print("="*60)
        print("💡 提示: 按 Ctrl+C 停止运行\n")
        logger.info(f"守护进程启动 - 不包含粉丝间隔: {interval_no_fans}小时, 包含粉丝间隔: {interval_with_fans}小时")
        # Backdate both timestamps so the first crawl runs immediately.
        last_no_fans_time = datetime.now() - timedelta(seconds=interval_no_fans_seconds)
        last_with_fans_time = datetime.now() - timedelta(seconds=interval_with_fans_seconds)
        cycle_count = 0
        consecutive_failures = 0
        max_consecutive_failures = 3  # warn once this many failures pile up
        while True:
            current_time = datetime.now()
            # Has either schedule's interval elapsed?
            time_since_no_fans = (current_time - last_no_fans_time).total_seconds()
            need_no_fans = time_since_no_fans >= interval_no_fans_seconds
            time_since_with_fans = (current_time - last_with_fans_time).total_seconds()
            need_with_fans = time_since_with_fans >= interval_with_fans_seconds
            if need_no_fans or need_with_fans:
                cycle_count += 1  # count only cycles that actually crawl
                # Prefer the fan-inclusive crawl when its interval has elapsed.
                fetch_fans = need_with_fans
                crawler_service = None
                try:
                    # Fresh service instance per cycle to avoid leaking state.
                    crawler_service = CrawlerService()
                    print(f"\n[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] 开始第 {cycle_count} 次抓取 ({'包含' if fetch_fans else '不包含'}粉丝数据)")
                    logger.info(f"开始第 {cycle_count} 次抓取 - 包含粉丝: {fetch_fans}")
                    filepath = crawler_service.crawl(
                        rank_type=args.rank_type,
                        size=args.size,
                        filter_type=args.filter_type,
                        fetch_fans=fetch_fans
                    )
                    if filepath:
                        print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ✅ 抓取完成: {filepath.name}")
                        logger.info(f"{cycle_count} 次抓取成功: {filepath.name}")
                        consecutive_failures = 0
                    else:
                        consecutive_failures += 1
                        print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ❌ 抓取失败 (连续失败 {consecutive_failures} 次)")
                        logger.error(f"{cycle_count} 次抓取失败 (连续失败 {consecutive_failures} 次)")
                        if consecutive_failures >= max_consecutive_failures:
                            _warn_repeated_failures(consecutive_failures)
                    # Advance the schedule whether the crawl succeeded or not,
                    # so a persistent failure does not retry in a tight loop.
                    # (Fix: the original updated nothing on a failed no-fans
                    # crawl, causing an immediate retry.)
                    if fetch_fans:
                        last_with_fans_time = current_time
                        last_no_fans_time = current_time  # a fan crawl also covers the no-fan schedule
                    else:
                        last_no_fans_time = current_time
                except Exception as e:
                    consecutive_failures += 1
                    error_msg = str(e)[:100]
                    print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ❌ 抓取异常: {error_msg} (连续失败 {consecutive_failures} 次)")
                    logger.error(f"{cycle_count} 次抓取异常: {e} (连续失败 {consecutive_failures} 次)", exc_info=True)
                    if consecutive_failures >= max_consecutive_failures:
                        _warn_repeated_failures(consecutive_failures)
                    # Advance the schedule on exceptions too, same reason.
                    if fetch_fans:
                        last_with_fans_time = current_time
                        last_no_fans_time = current_time
                    else:
                        last_no_fans_time = current_time
                finally:
                    # Release the service and force a GC pass to curb
                    # memory growth across long daemon runs.
                    if crawler_service is not None:
                        del crawler_service
                        gc.collect()
            # Sleep until the nearer of the two schedules is next due.
            check_time = datetime.now()
            next_check_seconds = min(
                max(0, interval_no_fans_seconds - (check_time - last_no_fans_time).total_seconds()),
                max(0, interval_with_fans_seconds - (check_time - last_with_fans_time).total_seconds())
            )
            if next_check_seconds > 0:
                next_time = check_time + timedelta(seconds=next_check_seconds)
                print(f"[{check_time.strftime('%Y-%m-%d %H:%M:%S')}] ⏳ 等待中... 下次抓取: {next_time.strftime('%Y-%m-%d %H:%M:%S')}", end='\r')
                # Sleep in slices so the countdown display can refresh
                # without flooding the console.
                wait_interval = 30
                waited = 0
                while waited < next_check_seconds:
                    sleep_time = min(wait_interval, next_check_seconds - waited)
                    time.sleep(sleep_time)
                    waited += sleep_time
                    if waited < next_check_seconds:
                        remaining = next_check_seconds - waited
                        next_time = datetime.now() + timedelta(seconds=remaining)
                        # Refresh roughly once a minute, more often in the last minute.
                        if remaining % 60 < 30 or remaining < 60:
                            print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] ⏳ 等待中... 下次抓取: {next_time.strftime('%Y-%m-%d %H:%M:%S')} ({int(remaining//60)}{int(remaining%60)}秒)", end='\r')
                print()  # newline after the \r progress line
    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断守护进程")
        logger.info("守护进程被用户中断")
        return 0
    except Exception as e:
        logger.error(f"守护进程执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
def main():
    """Parse command-line arguments and dispatch to the chosen sub-command.

    Returns:
        int: the sub-command's exit code, or 1 when no command was given
        or an unexpected error occurred.
    """
    parser = argparse.ArgumentParser(
        description='字节跳动直播服务平台数据抓取工具',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
python main.py login # 触发登录流程
python main.py status # 查看Cookie状态
"""
    )
    subparsers = parser.add_subparsers(dest='command', help='可用命令')
    # login / status take no extra options, so the returned parsers
    # need not be kept (the originals were unused locals).
    subparsers.add_parser('login', help='触发登录流程')
    subparsers.add_parser('status', help='查看Cookie状态')
    # crawl command
    crawl_parser = subparsers.add_parser('crawl', help='抓取排行榜数据')
    crawl_parser.add_argument('--rank-type', type=int, default=0, help='排行类型0=总榜')
    crawl_parser.add_argument('--size', type=int, default=10, help='每页数量')
    crawl_parser.add_argument('--filter-type', type=str, default='anchor', help='过滤类型')
    # Mutually exclusive fan-data switches writing to a single dest.
    fans_group = crawl_parser.add_mutually_exclusive_group()
    fans_group.add_argument('--fetch-fans', action='store_true', default=True, dest='fetch_fans', help='抓取粉丝数据(默认)')
    fans_group.add_argument('--no-fetch-fans', action='store_false', dest='fetch_fans', help='不抓取粉丝数据')
    # daemon command (looping mode)
    daemon_parser = subparsers.add_parser('daemon', help='循环运行模式(守护进程),定时自动抓取数据')
    daemon_parser.add_argument('--rank-type', type=int, default=0, help='排行类型0=总榜')
    daemon_parser.add_argument('--size', type=int, default=10, help='每页数量')
    daemon_parser.add_argument('--filter-type', type=str, default='anchor', help='过滤类型')
    daemon_parser.add_argument('--interval-no-fans', type=float, default=None,
                               help=f'不包含粉丝数据的抓取间隔(小时,默认: {DAEMON_INTERVAL_NO_FANS_HOURS}')
    daemon_parser.add_argument('--interval-with-fans', type=float, default=None,
                               help=f'包含粉丝数据的抓取间隔(小时,默认: {DAEMON_INTERVAL_WITH_FANS_HOURS}')
    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        return 1
    # Dispatch table keeps the command -> handler mapping in one place.
    handlers = {
        'login': lambda: login_command(),
        'status': lambda: status_command(),
        'crawl': lambda: crawl_command(args),
        'daemon': lambda: daemon_command(args),
    }
    try:
        handler = handlers.get(args.command)
        if handler is None:
            parser.print_help()
            return 1
        return handler()
    except Exception as e:
        logger.error(f"程序执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
# Script entry point: propagate the sub-command's exit code to the shell.
if __name__ == '__main__':
    sys.exit(main())