349 lines
15 KiB
Python
349 lines
15 KiB
Python
"""
|
||
主程序入口
|
||
提供命令行接口
|
||
"""
|
||
import sys
|
||
import argparse
|
||
import time
|
||
import gc
|
||
from datetime import datetime, timedelta
|
||
from core.browser_login import BrowserLogin
|
||
from core.cookie_manager import CookieManager
|
||
from services.crawler_service import CrawlerService
|
||
from services.daren_account_service import DarenAccountService
|
||
from config.settings import DAEMON_INTERVAL_HOURS
|
||
from utils.logger import logger
|
||
|
||
|
||
def login_command():
    """Run the interactive browser login flow.

    Returns:
        int: 0 when login produced cookies, 1 on failure or interruption.
    """
    try:
        session = BrowserLogin()
        obtained = session.trigger_login()

        # trigger_login() returns the captured cookies on success, falsy otherwise.
        if not obtained:
            print("\n❌ 登录流程失败")
            return 1

        print("\n✅ 登录流程完成!")
        return 0

    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断登录流程")
        logger.info("用户中断登录")
        return 1
    except Exception as e:
        logger.error(f"登录命令执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
|
||
|
||
|
||
def status_command():
    """Print the current cookie status (file, count, metadata) to stdout.

    Returns:
        int: always 0; this command only reports, it never fails.
    """
    manager = CookieManager()
    divider = "=" * 60

    print("\n" + divider)
    print("📊 Cookie状态")
    print(divider)

    if not manager.is_cookie_exists():
        print("❌ Cookie文件不存在")
        print("💡 提示: 运行 'python main.py login' 进行登录")
    else:
        data = manager.load_cookies()
        if not data:
            # File is present but could not be parsed/loaded.
            print("❌ Cookie文件存在但加载失败")
        else:
            meta = data.get("metadata", {})
            cookie_list = data.get("cookies", [])

            print(f"✅ Cookie文件存在")
            print(f"📁 文件路径: {manager.cookie_file}")
            print(f"📦 Cookie数量: {len(cookie_list)}")
            print(f"👤 用户名: {meta.get('username', 'Unknown')}")
            print(f"🕐 创建时间: {meta.get('created_at', 'Unknown')}")
            print(f"🕐 最后验证: {meta.get('last_validated', 'Unknown')}")
            print(f"🔗 登录URL: {meta.get('url', 'Unknown')}")

    print(divider + "\n")
    return 0
|
||
|
||
|
||
def crawl_command(args):
    """Crawl ranking-board data via CrawlerService.

    Args:
        args: parsed argparse namespace carrying rank_type, size and
            filter_type, and optionally fetch_fans.

    Returns:
        int: 0 on success, 1 on failure or interruption.
    """
    try:
        crawler_service = CrawlerService()

        # getattr with a default replaces the hasattr/ternary dance;
        # fans data is fetched unless explicitly disabled by the caller.
        fetch_fans = getattr(args, 'fetch_fans', True)

        filepath = crawler_service.crawl(
            rank_type=args.rank_type,
            size=args.size,
            filter_type=args.filter_type,
            fetch_fans=fetch_fans,
        )

        # crawl() returns a file path on success, falsy on failure.
        return 0 if filepath else 1

    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断抓取流程")
        logger.info("用户中断抓取")
        return 1
    except Exception as e:
        logger.error(f"抓取命令执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
|
||
|
||
|
||
def fetch_daren_command(args):
    """Crawl the MCN-bound daren (influencer) account data once.

    Args:
        args: parsed argparse namespace; may carry page_size and status.

    Returns:
        int: 0 on success, 1 on failure or interruption.
    """
    try:
        service = DarenAccountService()

        # getattr with a default replaces the hasattr/ternary dance;
        # defaults mirror the argparse defaults defined in main().
        page_size = getattr(args, 'page_size', 20)
        status = getattr(args, 'status', 1)

        filepath = service.crawl(page_size=page_size, status=status)

        if filepath:
            print(f"\n✅ 数据抓取成功!")
            print(f"📁 文件路径: {filepath}")
            return 0

        print(f"\n❌ 数据抓取失败")
        return 1

    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断抓取")
        logger.info("用户中断抓取")
        return 1
    except Exception as e:
        logger.error(f"抓取命令执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
|
||
|
||
|
||
def _print_failure_warning(consecutive_failures, third_cause):
    """Print and log the consecutive-failure warning banner.

    Extracted because the banner was duplicated verbatim in the failure
    and exception paths of daemon_command, differing only in cause #3.

    Args:
        consecutive_failures: length of the current run of failed cycles.
        third_cause: wording of the third suspected cause ("API服务异常"
            for a failed crawl, "程序异常" for a raised exception).
    """
    print(f"\n⚠️ 警告: 已连续失败 {consecutive_failures} 次!")
    print(" 可能的原因:")
    print(" 1. Cookie已失效,需要重新登录")
    print(" 2. 网络连接问题")
    print(f" 3. {third_cause}")
    print(f"\n 建议:运行 'python main.py login' 检查并更新Cookie")
    print(" 程序将继续运行,等待下次抓取...\n")
    logger.warning(f"连续失败 {consecutive_failures} 次,可能Cookie已失效")


def daemon_command(args):
    """Daemon mode: crawl daren account data on a fixed interval, forever.

    Runs until interrupted with Ctrl+C. A fresh DarenAccountService is
    created per cycle and released afterwards to bound memory growth.

    Args:
        args: parsed argparse namespace; may carry interval (hours),
            page_size and status.

    Returns:
        int: 0 when stopped by the user, 1 on an unrecoverable error.
    """
    try:
        # Resolve configuration. A falsy --interval (absent, None or 0)
        # falls back to the project-wide default.
        interval_hours = getattr(args, 'interval', None) or DAEMON_INTERVAL_HOURS
        interval_seconds = interval_hours * 3600

        page_size = getattr(args, 'page_size', 20)
        status = getattr(args, 'status', 1)

        print("\n" + "="*60)
        print("🔄 启动循环运行模式(守护进程)")
        print(f"📊 数据类型: 达人账号数据")
        print(f"⏰ 抓取间隔: {interval_hours} 小时")
        print(f"📄 每页数量: {page_size}")
        print("="*60)
        print("💡 提示: 按 Ctrl+C 停止运行\n")

        logger.info(f"守护进程启动 - 抓取间隔: {interval_hours}小时, 每页数量: {page_size}")

        # Back-date the last run so the first crawl happens immediately.
        last_crawl_time = datetime.now() - timedelta(seconds=interval_seconds)

        cycle_count = 0
        consecutive_failures = 0       # current run of failed cycles
        max_consecutive_failures = 3   # threshold that triggers the warning banner

        while True:
            current_time = datetime.now()

            # Is the next crawl due yet?
            time_since_last = (current_time - last_crawl_time).total_seconds()
            need_crawl = time_since_last >= interval_seconds

            if need_crawl:
                cycle_count += 1

                service = None
                try:
                    # Fresh service instance per cycle to avoid accumulating state.
                    service = DarenAccountService()

                    print(f"\n[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] 🚀 开始第 {cycle_count} 次抓取")
                    logger.info(f"开始第 {cycle_count} 次抓取达人账号数据")

                    filepath = service.crawl(page_size=page_size, status=status)

                    if filepath:
                        print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ✅ 抓取完成: {filepath.name}")
                        logger.info(f"第 {cycle_count} 次抓取成功: {filepath.name}")

                        # Success clears the failure streak and restarts the clock.
                        consecutive_failures = 0
                        last_crawl_time = current_time
                    else:
                        consecutive_failures += 1
                        print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ❌ 抓取失败 (连续失败 {consecutive_failures} 次)")
                        logger.error(f"第 {cycle_count} 次抓取失败 (连续失败 {consecutive_failures} 次)")

                        if consecutive_failures >= max_consecutive_failures:
                            _print_failure_warning(consecutive_failures, "API服务异常")

                        # Advance the clock even on failure so we don't retry in a tight loop.
                        last_crawl_time = current_time

                except Exception as e:
                    consecutive_failures += 1
                    error_msg = str(e)[:100]
                    print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ❌ 抓取异常: {error_msg} (连续失败 {consecutive_failures} 次)")
                    logger.error(f"第 {cycle_count} 次抓取异常: {e} (连续失败 {consecutive_failures} 次)", exc_info=True)

                    if consecutive_failures >= max_consecutive_failures:
                        _print_failure_warning(consecutive_failures, "程序异常")

                    # Advance the clock even on exception so we don't retry in a tight loop.
                    last_crawl_time = current_time

                finally:
                    # Drop the per-cycle service and force a collection pass.
                    if service is not None:
                        del service
                        gc.collect()

            # How long until the next crawl is due?
            check_time = datetime.now()
            next_check_seconds = max(0, interval_seconds - (check_time - last_crawl_time).total_seconds())

            if next_check_seconds > 0:
                next_time = check_time + timedelta(seconds=next_check_seconds)
                print(f"[{check_time.strftime('%Y-%m-%d %H:%M:%S')}] ⏳ 等待中... 下次抓取: {next_time.strftime('%Y-%m-%d %H:%M:%S')}", end='\r')

                # Sleep in 30-second slices so the countdown line can be refreshed
                # and Ctrl+C is handled promptly.
                wait_interval = 30
                waited = 0
                while waited < next_check_seconds:
                    sleep_time = min(wait_interval, next_check_seconds - waited)
                    time.sleep(sleep_time)
                    waited += sleep_time

                    if waited < next_check_seconds:
                        remaining = next_check_seconds - waited
                        next_time = datetime.now() + timedelta(seconds=remaining)
                        # Refresh roughly once a minute, and every slice in the last minute.
                        if remaining % 60 < 30 or remaining < 60:
                            hours = int(remaining // 3600)
                            minutes = int((remaining % 3600) // 60)
                            seconds = int(remaining % 60)
                            time_str = f"{hours}小时{minutes}分{seconds}秒" if hours > 0 else f"{minutes}分{seconds}秒"
                            print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] ⏳ 等待中... 下次抓取: {next_time.strftime('%Y-%m-%d %H:%M:%S')} (剩余 {time_str})", end='\r')

                print()  # move off the \r countdown line

    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断守护进程")
        logger.info("守护进程被用户中断")
        return 0
    except Exception as e:
        logger.error(f"守护进程执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
|
||
|
||
|
||
def main():
    """CLI entry point: build the argument parser and dispatch the subcommand.

    Returns:
        int: process exit code (0 on success, 1 on failure or no command).
    """
    parser = argparse.ArgumentParser(
        description='巨量百应 - MCN机构数据抓取工具',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
python main.py login # 触发登录流程
python main.py status # 查看Cookie状态
python main.py fetch-daren # 抓取达人账号数据(单次)
python main.py fetch-daren --page-size 50 # 自定义每页数量
python main.py daemon # 守护进程模式(默认1小时抓取一次)
python main.py daemon --interval 2 # 自定义抓取间隔(2小时)
"""
    )

    subparsers = parser.add_subparsers(dest='command', help='可用命令')

    # Commands without options of their own; the returned sub-parsers
    # were previously bound to unused locals.
    subparsers.add_parser('login', help='触发登录流程')
    subparsers.add_parser('status', help='查看Cookie状态')

    # crawl: ranking-board crawl options.
    crawl_parser = subparsers.add_parser('crawl', help='抓取排行榜数据')
    crawl_parser.add_argument('--rank-type', type=int, default=0, help='排行类型,0=总榜')
    crawl_parser.add_argument('--size', type=int, default=10, help='每页数量')
    crawl_parser.add_argument('--filter-type', type=str, default='anchor', help='过滤类型')

    # --fetch-fans / --no-fetch-fans are mutually exclusive; fetching is the default.
    fans_group = crawl_parser.add_mutually_exclusive_group()
    fans_group.add_argument('--fetch-fans', action='store_true', default=True, dest='fetch_fans', help='抓取粉丝数据(默认)')
    fans_group.add_argument('--no-fetch-fans', action='store_false', dest='fetch_fans', help='不抓取粉丝数据')

    # fetch-daren: single-shot daren account crawl.
    fetch_daren_parser = subparsers.add_parser('fetch-daren', help='抓取MCN机构绑定的达人账号数据')
    fetch_daren_parser.add_argument('--page-size', type=int, default=20, help='每页数量(默认20)')
    fetch_daren_parser.add_argument('--status', type=int, default=1, help='状态(1=全部,默认1)')

    # daemon: recurring crawl loop.
    daemon_parser = subparsers.add_parser('daemon', help='循环运行模式(守护进程),定时自动抓取达人账号数据')
    daemon_parser.add_argument('--interval', type=float, default=None,
                               help=f'抓取间隔(小时,默认: {DAEMON_INTERVAL_HOURS})')
    daemon_parser.add_argument('--page-size', type=int, default=20, help='每页数量(默认20)')
    daemon_parser.add_argument('--status', type=int, default=1, help='状态(默认1=全部)')

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return 1

    # Table dispatch replaces the if/elif chain; an unknown command falls
    # through to help + failure exactly as before.
    handlers = {
        'login': lambda: login_command(),
        'status': lambda: status_command(),
        'crawl': lambda: crawl_command(args),
        'fetch-daren': lambda: fetch_daren_command(args),
        'daemon': lambda: daemon_command(args),
    }

    try:
        handler = handlers.get(args.command)
        if handler is None:
            parser.print_help()
            return 1
        return handler()
    except Exception as e:
        logger.error(f"程序执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
|
||
|
||
|
||
if __name__ == '__main__':
    # Propagate main()'s return value as the process exit code so shell
    # scripts and schedulers can branch on success/failure.
    sys.exit(main())
|
||
|