live-forum/server/crawler/commerce/main.py
2026-03-24 11:27:37 +08:00

349 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
主程序入口
提供命令行接口
"""
import sys
import argparse
import time
import gc
from datetime import datetime, timedelta
from core.browser_login import BrowserLogin
from core.cookie_manager import CookieManager
from services.crawler_service import CrawlerService
from services.daren_account_service import DarenAccountService
from config.settings import DAEMON_INTERVAL_HOURS
from utils.logger import logger
def login_command():
    """Run the interactive browser login flow.

    Returns:
        int: 0 when cookies were obtained, 1 on failure or interruption.
    """
    try:
        # The browser login either yields cookies or None on failure.
        cookies = BrowserLogin().trigger_login()
    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断登录流程")
        logger.info("用户中断登录")
        return 1
    except Exception as e:
        logger.error(f"登录命令执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
    if not cookies:
        print("\n❌ 登录流程失败")
        return 1
    print("\n✅ 登录流程完成!")
    return 0
def status_command():
    """Print the on-disk cookie status report to stdout.

    Returns:
        int: always 0 (the report itself conveys success/failure).
    """
    manager = CookieManager()
    banner = "=" * 60
    print("\n" + banner)
    print("📊 Cookie状态")
    print(banner)
    if not manager.is_cookie_exists():
        # No cookie file at all — tell the user how to create one.
        print("❌ Cookie文件不存在")
        print("💡 提示: 运行 'python main.py login' 进行登录")
    else:
        data = manager.load_cookies()
        if not data:
            print("❌ Cookie文件存在但加载失败")
        else:
            meta = data.get("metadata", {})
            jar = data.get("cookies", [])
            print(f"✅ Cookie文件存在")
            print(f"📁 文件路径: {manager.cookie_file}")
            print(f"📦 Cookie数量: {len(jar)}")
            print(f"👤 用户名: {meta.get('username', 'Unknown')}")
            print(f"🕐 创建时间: {meta.get('created_at', 'Unknown')}")
            print(f"🕐 最后验证: {meta.get('last_validated', 'Unknown')}")
            print(f"🔗 登录URL: {meta.get('url', 'Unknown')}")
    print(banner + "\n")
    return 0
def crawl_command(args):
    """Crawl ranking-board data using the CLI arguments.

    Args:
        args: parsed argparse namespace with rank_type, size, filter_type
              and (optionally) fetch_fans attributes.

    Returns:
        int: 0 when the crawl produced a file, 1 otherwise.
    """
    try:
        crawler_service = CrawlerService()
        # getattr with a default replaces the hasattr/ternary dance;
        # fan data is fetched unless the flag is present and False.
        fetch_fans = getattr(args, 'fetch_fans', True)
        filepath = crawler_service.crawl(
            rank_type=args.rank_type,
            size=args.size,
            filter_type=args.filter_type,
            fetch_fans=fetch_fans
        )
        # A truthy filepath signals success.
        return 0 if filepath else 1
    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断抓取流程")
        logger.info("用户中断抓取")
        return 1
    except Exception as e:
        logger.error(f"抓取命令执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
def fetch_daren_command(args):
    """Crawl daren (creator) account data bound to the MCN organization.

    Args:
        args: parsed argparse namespace; page_size (default 20) and
              status (default 1) are read when present.

    Returns:
        int: 0 when a result file was produced, 1 otherwise.
    """
    try:
        service = DarenAccountService()
        # getattr with defaults is the idiomatic form of the
        # hasattr-guarded attribute reads.
        page_size = getattr(args, 'page_size', 20)
        status = getattr(args, 'status', 1)
        filepath = service.crawl(page_size=page_size, status=status)
        if filepath:
            print(f"\n✅ 数据抓取成功!")
            print(f"📁 文件路径: {filepath}")
            return 0
        print(f"\n❌ 数据抓取失败")
        return 1
    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断抓取")
        logger.info("用户中断抓取")
        return 1
    except Exception as e:
        logger.error(f"抓取命令执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
def daemon_command(args):
    """Loop (daemon) mode: periodically crawl daren account data.

    The first crawl runs immediately on startup; afterwards one crawl is
    attempted every ``interval`` hours (falling back to
    DAEMON_INTERVAL_HOURS when the flag is absent or falsy). Runs until
    interrupted.

    Args:
        args: parsed argparse namespace; interval, page_size and status
              are read when present.

    Returns:
        int: 0 when stopped via Ctrl+C, 1 on an unexpected error.
    """
    try:
        # Resolve configuration, preferring CLI flags over defaults.
        interval_hours = args.interval if hasattr(args, 'interval') and args.interval else DAEMON_INTERVAL_HOURS
        interval_seconds = interval_hours * 3600
        page_size = args.page_size if hasattr(args, 'page_size') else 20
        status = args.status if hasattr(args, 'status') else 1
        print("\n" + "="*60)
        print("🔄 启动循环运行模式(守护进程)")
        print(f"📊 数据类型: 达人账号数据")
        print(f"⏰ 抓取间隔: {interval_hours} 小时")
        print(f"📄 每页数量: {page_size}")
        print("="*60)
        print("💡 提示: 按 Ctrl+C 停止运行\n")
        logger.info(f"守护进程启动 - 抓取间隔: {interval_hours}小时, 每页数量: {page_size}")
        # Back-date the last crawl time so the first cycle fires immediately.
        last_crawl_time = datetime.now() - timedelta(seconds=interval_seconds)
        cycle_count = 0
        consecutive_failures = 0  # consecutive failure counter
        max_consecutive_failures = 3  # threshold before warning the operator
        while True:
            current_time = datetime.now()
            # Check whether the configured interval has elapsed.
            time_since_last = (current_time - last_crawl_time).total_seconds()
            need_crawl = time_since_last >= interval_seconds
            if need_crawl:
                cycle_count += 1
                service = None
                try:
                    # A fresh service instance per cycle avoids accumulating state
                    # (memory-leak mitigation, per the original intent).
                    service = DarenAccountService()
                    print(f"\n[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] 🚀 开始第 {cycle_count} 次抓取")
                    logger.info(f"开始第 {cycle_count} 次抓取达人账号数据")
                    # Run the crawl; a truthy filepath means success.
                    filepath = service.crawl(page_size=page_size, status=status)
                    if filepath:
                        print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ✅ 抓取完成: {filepath.name}")
                        logger.info(f"{cycle_count} 次抓取成功: {filepath.name}")
                        # Success resets the failure streak.
                        consecutive_failures = 0
                        # Record when this crawl happened.
                        last_crawl_time = current_time
                    else:
                        consecutive_failures += 1
                        print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ❌ 抓取失败 (连续失败 {consecutive_failures} 次)")
                        logger.error(f"{cycle_count} 次抓取失败 (连续失败 {consecutive_failures} 次)")
                        # Warn the operator after too many failures in a row.
                        if consecutive_failures >= max_consecutive_failures:
                            print(f"\n⚠️ 警告: 已连续失败 {consecutive_failures} 次!")
                            print(" 可能的原因:")
                            print(" 1. Cookie已失效需要重新登录")
                            print(" 2. 网络连接问题")
                            print(" 3. API服务异常")
                            print(f"\n 建议:运行 'python main.py login' 检查并更新Cookie")
                            print(" 程序将继续运行,等待下次抓取...\n")
                            logger.warning(f"连续失败 {consecutive_failures}可能Cookie已失效")
                        # Advance the timestamp even on failure so we wait a full
                        # interval instead of retrying in a tight loop.
                        last_crawl_time = current_time
                except Exception as e:
                    consecutive_failures += 1
                    error_msg = str(e)[:100]
                    print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ❌ 抓取异常: {error_msg} (连续失败 {consecutive_failures} 次)")
                    logger.error(f"{cycle_count} 次抓取异常: {e} (连续失败 {consecutive_failures} 次)", exc_info=True)
                    # Same operator warning as the non-exception failure branch.
                    if consecutive_failures >= max_consecutive_failures:
                        print(f"\n⚠️ 警告: 已连续失败 {consecutive_failures} 次!")
                        print(" 可能的原因:")
                        print(" 1. Cookie已失效需要重新登录")
                        print(" 2. 网络连接问题")
                        print(" 3. 程序异常")
                        print(f"\n 建议:运行 'python main.py login' 检查并更新Cookie")
                        print(" 程序将继续运行,等待下次抓取...\n")
                        logger.warning(f"连续失败 {consecutive_failures}可能Cookie已失效")
                    # Advance the timestamp after an exception as well.
                    last_crawl_time = current_time
                finally:
                    # Drop the service reference and force a GC pass to curb
                    # memory growth across cycles.
                    if service is not None:
                        del service
                        gc.collect()
            # Compute how long to wait until the next crawl is due.
            check_time = datetime.now()
            next_check_seconds = max(0, interval_seconds - (check_time - last_crawl_time).total_seconds())
            if next_check_seconds > 0:
                # Show the next scheduled crawl time on a self-overwriting line
                # (end='\r' keeps the status on one console line).
                next_time = check_time + timedelta(seconds=next_check_seconds)
                print(f"[{check_time.strftime('%Y-%m-%d %H:%M:%S')}] ⏳ 等待中... 下次抓取: {next_time.strftime('%Y-%m-%d %H:%M:%S')}", end='\r')
                # Sleep in 30-second slices so the countdown can be refreshed
                # and Ctrl+C is handled promptly.
                wait_interval = 30
                waited = 0
                while waited < next_check_seconds:
                    sleep_time = min(wait_interval, next_check_seconds - waited)
                    time.sleep(sleep_time)
                    waited += sleep_time
                    # Refresh the countdown display.
                    if waited < next_check_seconds:
                        remaining = next_check_seconds - waited
                        next_time = datetime.now() + timedelta(seconds=remaining)
                        # Only repaint roughly once per minute (always in the
                        # final minute).
                        if remaining % 60 < 30 or remaining < 60:
                            hours = int(remaining // 3600)
                            minutes = int((remaining % 3600) // 60)
                            seconds = int(remaining % 60)
                            time_str = f"{hours}小时{minutes}{seconds}" if hours > 0 else f"{minutes}{seconds}"
                            print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] ⏳ 等待中... 下次抓取: {next_time.strftime('%Y-%m-%d %H:%M:%S')} (剩余 {time_str})", end='\r')
                print()  # newline to terminate the \r status line
    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断守护进程")
        logger.info("守护进程被用户中断")
        return 0
    except Exception as e:
        logger.error(f"守护进程执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
def main():
    """CLI entry point: build the argument parser and dispatch sub-commands.

    Returns:
        int: the process exit code (0 on success, 1 on failure).
    """
    parser = argparse.ArgumentParser(
        description='巨量百应 - MCN机构数据抓取工具',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
python main.py login # 触发登录流程
python main.py status # 查看Cookie状态
python main.py fetch-daren # 抓取达人账号数据(单次)
python main.py fetch-daren --page-size 50 # 自定义每页数量
python main.py daemon # 守护进程模式默认1小时抓取一次
python main.py daemon --interval 2 # 自定义抓取间隔2小时
"""
    )
    sub = parser.add_subparsers(dest='command', help='可用命令')
    # login / status take no extra options.
    sub.add_parser('login', help='触发登录流程')
    sub.add_parser('status', help='查看Cookie状态')
    # crawl: ranking-board options.
    crawl_p = sub.add_parser('crawl', help='抓取排行榜数据')
    crawl_p.add_argument('--rank-type', type=int, default=0, help='排行类型0=总榜')
    crawl_p.add_argument('--size', type=int, default=10, help='每页数量')
    crawl_p.add_argument('--filter-type', type=str, default='anchor', help='过滤类型')
    # --fetch-fans / --no-fetch-fans are mutually exclusive; fetching defaults on.
    fans = crawl_p.add_mutually_exclusive_group()
    fans.add_argument('--fetch-fans', action='store_true', default=True, dest='fetch_fans', help='抓取粉丝数据(默认)')
    fans.add_argument('--no-fetch-fans', action='store_false', dest='fetch_fans', help='不抓取粉丝数据')
    # fetch-daren: one-shot daren account crawl.
    daren_p = sub.add_parser('fetch-daren', help='抓取MCN机构绑定的达人账号数据')
    daren_p.add_argument('--page-size', type=int, default=20, help='每页数量默认20')
    daren_p.add_argument('--status', type=int, default=1, help='状态1=全部默认1')
    # daemon: recurring crawl loop.
    daemon_p = sub.add_parser('daemon', help='循环运行模式(守护进程),定时自动抓取达人账号数据')
    daemon_p.add_argument('--interval', type=float, default=None,
                          help=f'抓取间隔(小时,默认: {DAEMON_INTERVAL_HOURS}')
    daemon_p.add_argument('--page-size', type=int, default=20, help='每页数量默认20')
    daemon_p.add_argument('--status', type=int, default=1, help='状态默认1=全部)')
    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        return 1
    # Table-driven dispatch instead of an if/elif chain; zero-arg handlers
    # are adapted to the common (args) -> int shape.
    handlers = {
        'login': lambda _a: login_command(),
        'status': lambda _a: status_command(),
        'crawl': crawl_command,
        'fetch-daren': fetch_daren_command,
        'daemon': daemon_command,
    }
    try:
        handler = handlers.get(args.command)
        if handler is None:
            parser.print_help()
            return 1
        return handler(args)
    except Exception as e:
        logger.error(f"程序执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
if __name__ == '__main__':
    # Propagate the command handler's integer result as the process exit code.
    sys.exit(main())