live-forum/server/crawler/commerce/main.py
2026-03-24 11:27:37 +08:00

349 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
主程序入口
提供命令行接口
"""
import sys
import argparse
import time
import gc
from datetime import datetime, timedelta
from core.browser_login import BrowserLogin
from core.cookie_manager import CookieManager
from services.crawler_service import CrawlerService
from services.daren_account_service import DarenAccountService
from config.settings import DAEMON_INTERVAL_HOURS
from utils.logger import logger
def login_command():
    """Run the interactive browser login flow.

    Returns:
        int: 0 when cookies were obtained, 1 on failure or interruption.
    """
    try:
        # The browser login either yields cookies or None on failure.
        cookies = BrowserLogin().trigger_login()
    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断登录流程")
        logger.info("用户中断登录")
        return 1
    except Exception as e:
        logger.error(f"登录命令执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
    if not cookies:
        print("\n❌ 登录流程失败")
        return 1
    print("\n✅ 登录流程完成!")
    return 0
def status_command():
    """Print the on-disk cookie status report to stdout.

    Returns:
        int: always 0 (the report itself conveys success/failure).
    """
    manager = CookieManager()
    banner = "=" * 60
    print("\n" + banner)
    print("📊 Cookie状态")
    print(banner)
    if not manager.is_cookie_exists():
        # No cookie file at all — tell the user how to create one.
        print("❌ Cookie文件不存在")
        print("💡 提示: 运行 'python main.py login' 进行登录")
    else:
        data = manager.load_cookies()
        if not data:
            print("❌ Cookie文件存在但加载失败")
        else:
            meta = data.get("metadata", {})
            jar = data.get("cookies", [])
            print(f"✅ Cookie文件存在")
            print(f"📁 文件路径: {manager.cookie_file}")
            print(f"📦 Cookie数量: {len(jar)}")
            print(f"👤 用户名: {meta.get('username', 'Unknown')}")
            print(f"🕐 创建时间: {meta.get('created_at', 'Unknown')}")
            print(f"🕐 最后验证: {meta.get('last_validated', 'Unknown')}")
            print(f"🔗 登录URL: {meta.get('url', 'Unknown')}")
    print(banner + "\n")
    return 0
def crawl_command(args):
    """Crawl ranking-board data using the CLI arguments.

    Args:
        args: parsed argparse namespace with rank_type, size, filter_type
              and (optionally) fetch_fans attributes.

    Returns:
        int: 0 when the crawl produced a file, 1 otherwise.
    """
    try:
        crawler_service = CrawlerService()
        # getattr with a default replaces the hasattr/ternary dance;
        # fan data is fetched unless the flag is present and False.
        fetch_fans = getattr(args, 'fetch_fans', True)
        filepath = crawler_service.crawl(
            rank_type=args.rank_type,
            size=args.size,
            filter_type=args.filter_type,
            fetch_fans=fetch_fans
        )
        # A truthy filepath signals success.
        return 0 if filepath else 1
    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断抓取流程")
        logger.info("用户中断抓取")
        return 1
    except Exception as e:
        logger.error(f"抓取命令执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
def fetch_daren_command(args):
    """Crawl daren (creator) account data bound to the MCN organization.

    Args:
        args: parsed argparse namespace; page_size (default 20) and
              status (default 1) are read when present.

    Returns:
        int: 0 when a result file was produced, 1 otherwise.
    """
    try:
        service = DarenAccountService()
        # getattr with defaults is the idiomatic form of the
        # hasattr-guarded attribute reads.
        page_size = getattr(args, 'page_size', 20)
        status = getattr(args, 'status', 1)
        filepath = service.crawl(page_size=page_size, status=status)
        if filepath:
            print(f"\n✅ 数据抓取成功!")
            print(f"📁 文件路径: {filepath}")
            return 0
        print(f"\n❌ 数据抓取失败")
        return 1
    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断抓取")
        logger.info("用户中断抓取")
        return 1
    except Exception as e:
        logger.error(f"抓取命令执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
def daemon_command(args):
    """Loop (daemon) mode: periodically crawl daren account data.

    The first crawl runs immediately on startup; afterwards one crawl is
    attempted every ``interval`` hours (falling back to
    DAEMON_INTERVAL_HOURS when the flag is absent or falsy). Runs until
    interrupted.

    Args:
        args: parsed argparse namespace; interval, page_size and status
              are read when present.

    Returns:
        int: 0 when stopped via Ctrl+C, 1 on an unexpected error.
    """
    try:
        # Resolve configuration, preferring CLI flags over defaults.
        interval_hours = args.interval if hasattr(args, 'interval') and args.interval else DAEMON_INTERVAL_HOURS
        interval_seconds = interval_hours * 3600
        page_size = args.page_size if hasattr(args, 'page_size') else 20
        status = args.status if hasattr(args, 'status') else 1
        print("\n" + "="*60)
        print("🔄 启动循环运行模式(守护进程)")
        print(f"📊 数据类型: 达人账号数据")
        print(f"⏰ 抓取间隔: {interval_hours} 小时")
        print(f"📄 每页数量: {page_size}")
        print("="*60)
        print("💡 提示: 按 Ctrl+C 停止运行\n")
        logger.info(f"守护进程启动 - 抓取间隔: {interval_hours}小时, 每页数量: {page_size}")
        # Back-date the last crawl time so the first cycle fires immediately.
        last_crawl_time = datetime.now() - timedelta(seconds=interval_seconds)
        cycle_count = 0
        consecutive_failures = 0  # consecutive failure counter
        max_consecutive_failures = 3  # threshold before warning the operator
        while True:
            current_time = datetime.now()
            # Check whether the configured interval has elapsed.
            time_since_last = (current_time - last_crawl_time).total_seconds()
            need_crawl = time_since_last >= interval_seconds
            if need_crawl:
                cycle_count += 1
                service = None
                try:
                    # A fresh service instance per cycle avoids accumulating state
                    # (memory-leak mitigation, per the original intent).
                    service = DarenAccountService()
                    print(f"\n[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] 🚀 开始第 {cycle_count} 次抓取")
                    logger.info(f"开始第 {cycle_count} 次抓取达人账号数据")
                    # Run the crawl; a truthy filepath means success.
                    filepath = service.crawl(page_size=page_size, status=status)
                    if filepath:
                        print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ✅ 抓取完成: {filepath.name}")
                        logger.info(f"{cycle_count} 次抓取成功: {filepath.name}")
                        # Success resets the failure streak.
                        consecutive_failures = 0
                        # Record when this crawl happened.
                        last_crawl_time = current_time
                    else:
                        consecutive_failures += 1
                        print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ❌ 抓取失败 (连续失败 {consecutive_failures} 次)")
                        logger.error(f"{cycle_count} 次抓取失败 (连续失败 {consecutive_failures} 次)")
                        # Warn the operator after too many failures in a row.
                        if consecutive_failures >= max_consecutive_failures:
                            print(f"\n⚠️ 警告: 已连续失败 {consecutive_failures} 次!")
                            print(" 可能的原因:")
                            print(" 1. Cookie已失效需要重新登录")
                            print(" 2. 网络连接问题")
                            print(" 3. API服务异常")
                            print(f"\n 建议:运行 'python main.py login' 检查并更新Cookie")
                            print(" 程序将继续运行,等待下次抓取...\n")
                            logger.warning(f"连续失败 {consecutive_failures}可能Cookie已失效")
                        # Advance the timestamp even on failure so we wait a full
                        # interval instead of retrying in a tight loop.
                        last_crawl_time = current_time
                except Exception as e:
                    consecutive_failures += 1
                    error_msg = str(e)[:100]
                    print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ❌ 抓取异常: {error_msg} (连续失败 {consecutive_failures} 次)")
                    logger.error(f"{cycle_count} 次抓取异常: {e} (连续失败 {consecutive_failures} 次)", exc_info=True)
                    # Same operator warning as the non-exception failure branch.
                    if consecutive_failures >= max_consecutive_failures:
                        print(f"\n⚠️ 警告: 已连续失败 {consecutive_failures} 次!")
                        print(" 可能的原因:")
                        print(" 1. Cookie已失效需要重新登录")
                        print(" 2. 网络连接问题")
                        print(" 3. 程序异常")
                        print(f"\n 建议:运行 'python main.py login' 检查并更新Cookie")
                        print(" 程序将继续运行,等待下次抓取...\n")
                        logger.warning(f"连续失败 {consecutive_failures}可能Cookie已失效")
                    # Advance the timestamp after an exception as well.
                    last_crawl_time = current_time
                finally:
                    # Drop the service reference and force a GC pass to curb
                    # memory growth across cycles.
                    if service is not None:
                        del service
                        gc.collect()
            # Compute how long to wait until the next crawl is due.
            check_time = datetime.now()
            next_check_seconds = max(0, interval_seconds - (check_time - last_crawl_time).total_seconds())
            if next_check_seconds > 0:
                # Show the next scheduled crawl time on a self-overwriting line
                # (end='\r' keeps the status on one console line).
                next_time = check_time + timedelta(seconds=next_check_seconds)
                print(f"[{check_time.strftime('%Y-%m-%d %H:%M:%S')}] ⏳ 等待中... 下次抓取: {next_time.strftime('%Y-%m-%d %H:%M:%S')}", end='\r')
                # Sleep in 30-second slices so the countdown can be refreshed
                # and Ctrl+C is handled promptly.
                wait_interval = 30
                waited = 0
                while waited < next_check_seconds:
                    sleep_time = min(wait_interval, next_check_seconds - waited)
                    time.sleep(sleep_time)
                    waited += sleep_time
                    # Refresh the countdown display.
                    if waited < next_check_seconds:
                        remaining = next_check_seconds - waited
                        next_time = datetime.now() + timedelta(seconds=remaining)
                        # Only repaint roughly once per minute (always in the
                        # final minute).
                        if remaining % 60 < 30 or remaining < 60:
                            hours = int(remaining // 3600)
                            minutes = int((remaining % 3600) // 60)
                            seconds = int(remaining % 60)
                            time_str = f"{hours}小时{minutes}{seconds}" if hours > 0 else f"{minutes}{seconds}"
                            print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] ⏳ 等待中... 下次抓取: {next_time.strftime('%Y-%m-%d %H:%M:%S')} (剩余 {time_str})", end='\r')
                print()  # newline to terminate the \r status line
    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断守护进程")
        logger.info("守护进程被用户中断")
        return 0
    except Exception as e:
        logger.error(f"守护进程执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
def main():
    """CLI entry point: build the argument parser and dispatch sub-commands.

    Returns:
        int: the process exit code (0 on success, 1 on failure).
    """
    parser = argparse.ArgumentParser(
        description='巨量百应 - MCN机构数据抓取工具',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
python main.py login # 触发登录流程
python main.py status # 查看Cookie状态
python main.py fetch-daren # 抓取达人账号数据(单次)
python main.py fetch-daren --page-size 50 # 自定义每页数量
python main.py daemon # 守护进程模式默认1小时抓取一次
python main.py daemon --interval 2 # 自定义抓取间隔2小时
"""
    )
    sub = parser.add_subparsers(dest='command', help='可用命令')
    # login / status take no extra options.
    sub.add_parser('login', help='触发登录流程')
    sub.add_parser('status', help='查看Cookie状态')
    # crawl: ranking-board options.
    crawl_p = sub.add_parser('crawl', help='抓取排行榜数据')
    crawl_p.add_argument('--rank-type', type=int, default=0, help='排行类型0=总榜')
    crawl_p.add_argument('--size', type=int, default=10, help='每页数量')
    crawl_p.add_argument('--filter-type', type=str, default='anchor', help='过滤类型')
    # --fetch-fans / --no-fetch-fans are mutually exclusive; fetching defaults on.
    fans = crawl_p.add_mutually_exclusive_group()
    fans.add_argument('--fetch-fans', action='store_true', default=True, dest='fetch_fans', help='抓取粉丝数据(默认)')
    fans.add_argument('--no-fetch-fans', action='store_false', dest='fetch_fans', help='不抓取粉丝数据')
    # fetch-daren: one-shot daren account crawl.
    daren_p = sub.add_parser('fetch-daren', help='抓取MCN机构绑定的达人账号数据')
    daren_p.add_argument('--page-size', type=int, default=20, help='每页数量默认20')
    daren_p.add_argument('--status', type=int, default=1, help='状态1=全部默认1')
    # daemon: recurring crawl loop.
    daemon_p = sub.add_parser('daemon', help='循环运行模式(守护进程),定时自动抓取达人账号数据')
    daemon_p.add_argument('--interval', type=float, default=None,
                          help=f'抓取间隔(小时,默认: {DAEMON_INTERVAL_HOURS}')
    daemon_p.add_argument('--page-size', type=int, default=20, help='每页数量默认20')
    daemon_p.add_argument('--status', type=int, default=1, help='状态默认1=全部)')
    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        return 1
    # Table-driven dispatch instead of an if/elif chain; zero-arg handlers
    # are adapted to the common (args) -> int shape.
    handlers = {
        'login': lambda _a: login_command(),
        'status': lambda _a: status_command(),
        'crawl': crawl_command,
        'fetch-daren': fetch_daren_command,
        'daemon': daemon_command,
    }
    try:
        handler = handlers.get(args.command)
        if handler is None:
            parser.print_help()
            return 1
        return handler(args)
    except Exception as e:
        logger.error(f"程序执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
if __name__ == '__main__':
    # Propagate the command handler's integer result as the process exit code.
    sys.exit(main())