336 lines
15 KiB
Python
336 lines
15 KiB
Python
"""
|
||
主程序入口
|
||
提供命令行接口
|
||
"""
|
||
import sys
|
||
import argparse
|
||
import time
|
||
import gc
|
||
from datetime import datetime, timedelta
|
||
from core.browser_login import BrowserLogin
|
||
from core.cookie_manager import CookieManager
|
||
from services.crawler_service import CrawlerService
|
||
from config.settings import (
|
||
DAEMON_INTERVAL_NO_FANS_HOURS,
|
||
DAEMON_INTERVAL_WITH_FANS_HOURS
|
||
)
|
||
from utils.logger import logger
|
||
|
||
|
||
def login_command():
    """Trigger the interactive browser login flow.

    Returns:
        int: 0 when cookies were obtained, 1 on failure, interruption,
        or unexpected error.
    """
    try:
        flow = BrowserLogin()
        obtained_cookies = flow.trigger_login()

        # Guard clause: bail out early on a failed login.
        if not obtained_cookies:
            print("\n❌ 登录流程失败")
            return 1

        print("\n✅ 登录流程完成!")
        return 0

    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断登录流程")
        logger.info("用户中断登录")
        return 1
    except Exception as e:
        logger.error(f"登录命令执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
|
||
|
||
|
||
def status_command():
    """Print the current cookie status report to stdout.

    Returns:
        int: always 0 (the command itself never fails).
    """
    manager = CookieManager()
    separator = "=" * 60

    print("\n" + separator)
    print("📊 Cookie状态")
    print(separator)

    if not manager.is_cookie_exists():
        print("❌ Cookie文件不存在")
        print("💡 提示: 运行 'python main.py login' 进行登录")
    else:
        data = manager.load_cookies()
        if not data:
            # File is present but could not be parsed/loaded.
            print("❌ Cookie文件存在但加载失败")
        else:
            meta = data.get("metadata", {})
            cookie_list = data.get("cookies", [])

            print("✅ Cookie文件存在")
            print(f"📁 文件路径: {manager.cookie_file}")
            print(f"📦 Cookie数量: {len(cookie_list)}")
            print(f"👤 用户名: {meta.get('username', 'Unknown')}")
            print(f"🕐 创建时间: {meta.get('created_at', 'Unknown')}")
            print(f"🕐 最后验证: {meta.get('last_validated', 'Unknown')}")
            print(f"🔗 登录URL: {meta.get('url', 'Unknown')}")

    print(separator + "\n")
    return 0
|
||
|
||
|
||
def crawl_command(args):
    """Run a single leaderboard crawl.

    Args:
        args: Parsed CLI namespace providing ``rank_type``, ``size``,
            ``filter_type`` and optionally ``fetch_fans``.

    Returns:
        int: 0 if a result file was produced, 1 on failure,
        interruption, or unexpected error.
    """
    try:
        crawler_service = CrawlerService()

        # Default to fetching fan data when the flag is absent;
        # getattr replaces the hasattr()/ternary construction.
        fetch_fans = getattr(args, 'fetch_fans', True)

        filepath = crawler_service.crawl(
            rank_type=args.rank_type,
            size=args.size,
            filter_type=args.filter_type,
            fetch_fans=fetch_fans
        )

        # crawl() returns the output path on success, a falsy value on failure.
        return 0 if filepath else 1

    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断抓取流程")
        logger.info("用户中断抓取")
        return 1
    except Exception as e:
        logger.error(f"抓取命令执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
|
||
|
||
|
||
def _warn_consecutive_failures(count):
    """Warn the operator after too many consecutive crawl failures.

    Prints the likely causes and a remediation hint; the daemon keeps
    running and simply waits for the next scheduled crawl.

    Args:
        count: Current number of consecutive failures.
    """
    print(f"\n⚠️ 警告: 已连续失败 {count} 次!")
    print("   可能的原因:")
    print("   1. Cookie已失效,需要重新登录")
    print("   2. 网络连接问题")
    print("   3. API服务异常")
    print(f"\n   建议:运行 'python main.py login' 检查并更新Cookie")
    print("   程序将继续运行,等待下次抓取...\n")
    logger.warning(f"连续失败 {count} 次,可能Cookie已失效")


def daemon_command(args):
    """Run the crawler in daemon (loop) mode.

    Maintains two independent schedules: one for crawls without fan data
    and one for crawls that include it. A fan-inclusive crawl also
    satisfies the no-fan schedule.

    Args:
        args: Parsed CLI namespace with ``rank_type``, ``size``,
            ``filter_type``, ``interval_no_fans`` and
            ``interval_with_fans``.

    Returns:
        int: 0 when stopped by the user (Ctrl+C), 1 on fatal error.
    """
    try:
        # Fall back to the configured defaults only when the flag was
        # omitted. FIX: the previous truthiness test replaced an explicit
        # 0 with the default; `is not None` keeps 0 usable (default=None).
        interval_no_fans = (
            args.interval_no_fans
            if args.interval_no_fans is not None
            else DAEMON_INTERVAL_NO_FANS_HOURS
        )
        interval_with_fans = (
            args.interval_with_fans
            if args.interval_with_fans is not None
            else DAEMON_INTERVAL_WITH_FANS_HOURS
        )

        # Convert hours to seconds once, up front.
        interval_no_fans_seconds = interval_no_fans * 3600
        interval_with_fans_seconds = interval_with_fans * 3600

        print("\n" + "="*60)
        print("🔄 启动循环运行模式(守护进程)")
        print(f"⏰ 不包含粉丝数据抓取间隔: {interval_no_fans} 小时")
        print(f"⏰ 包含粉丝数据抓取间隔: {interval_with_fans} 小时")
        print("="*60)
        print("💡 提示: 按 Ctrl+C 停止运行\n")

        logger.info(f"守护进程启动 - 不包含粉丝间隔: {interval_no_fans}小时, 包含粉丝间隔: {interval_with_fans}小时")

        # Backdate both timestamps so the first crawl runs immediately.
        last_no_fans_time = datetime.now() - timedelta(seconds=interval_no_fans_seconds)
        last_with_fans_time = datetime.now() - timedelta(seconds=interval_with_fans_seconds)

        cycle_count = 0
        consecutive_failures = 0
        max_consecutive_failures = 3  # warn the operator past this threshold

        while True:
            current_time = datetime.now()

            # Is either schedule due?
            time_since_no_fans = (current_time - last_no_fans_time).total_seconds()
            need_no_fans = time_since_no_fans >= interval_no_fans_seconds

            time_since_with_fans = (current_time - last_with_fans_time).total_seconds()
            need_with_fans = time_since_with_fans >= interval_with_fans_seconds

            if need_no_fans or need_with_fans:
                cycle_count += 1  # count only cycles that actually crawl

                # Fan-inclusive crawls take priority when both are due.
                fetch_fans = need_with_fans

                crawler_service = None
                try:
                    # Fresh service instance per cycle to avoid leaking state.
                    crawler_service = CrawlerService()

                    print(f"\n[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] 开始第 {cycle_count} 次抓取 ({'包含' if fetch_fans else '不包含'}粉丝数据)")
                    logger.info(f"开始第 {cycle_count} 次抓取 - 包含粉丝: {fetch_fans}")

                    filepath = crawler_service.crawl(
                        rank_type=args.rank_type,
                        size=args.size,
                        filter_type=args.filter_type,
                        fetch_fans=fetch_fans
                    )

                    if filepath:
                        print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ✅ 抓取完成: {filepath.name}")
                        logger.info(f"第 {cycle_count} 次抓取成功: {filepath.name}")

                        consecutive_failures = 0

                        # A fan-inclusive crawl also satisfies the no-fan schedule.
                        if fetch_fans:
                            last_with_fans_time = current_time
                            last_no_fans_time = current_time
                        else:
                            last_no_fans_time = current_time
                    else:
                        consecutive_failures += 1
                        print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ❌ 抓取失败 (连续失败 {consecutive_failures} 次)")
                        logger.error(f"第 {cycle_count} 次抓取失败 (连续失败 {consecutive_failures} 次)")

                        if consecutive_failures >= max_consecutive_failures:
                            _warn_consecutive_failures(consecutive_failures)

                        # Advance the schedule even on failure so we don't
                        # retry in a tight loop.
                        if fetch_fans:
                            last_with_fans_time = current_time
                        last_no_fans_time = current_time

                except Exception as e:
                    consecutive_failures += 1
                    error_msg = str(e)[:100]  # keep the console line short
                    print(f"[{current_time.strftime('%Y-%m-%d %H:%M:%S')}] ❌ 抓取异常: {error_msg} (连续失败 {consecutive_failures} 次)")
                    logger.error(f"第 {cycle_count} 次抓取异常: {e} (连续失败 {consecutive_failures} 次)", exc_info=True)

                    if consecutive_failures >= max_consecutive_failures:
                        _warn_consecutive_failures(consecutive_failures)

                    # Advance the schedule even on exception (same rationale).
                    if fetch_fans:
                        last_with_fans_time = current_time
                    last_no_fans_time = current_time

                finally:
                    # Drop the per-cycle service and force a GC pass to
                    # reclaim memory between cycles.
                    if crawler_service is not None:
                        del crawler_service
                    gc.collect()

            # Sleep until the nearer of the two next scheduled crawls.
            check_time = datetime.now()
            next_check_seconds = min(
                max(0, interval_no_fans_seconds - (check_time - last_no_fans_time).total_seconds()),
                max(0, interval_with_fans_seconds - (check_time - last_with_fans_time).total_seconds())
            )

            # If both schedules are already due, loop immediately; else wait.
            if next_check_seconds > 0:
                next_time = check_time + timedelta(seconds=next_check_seconds)
                print(f"[{check_time.strftime('%Y-%m-%d %H:%M:%S')}] ⏳ 等待中... 下次抓取: {next_time.strftime('%Y-%m-%d %H:%M:%S')}", end='\r')

                # Sleep in 30-second slices so the countdown display stays
                # fresh and Ctrl+C is handled promptly.
                wait_interval = 30
                waited = 0
                while waited < next_check_seconds:
                    sleep_time = min(wait_interval, next_check_seconds - waited)
                    time.sleep(sleep_time)
                    waited += sleep_time

                    if waited < next_check_seconds:
                        remaining = next_check_seconds - waited
                        next_time = datetime.now() + timedelta(seconds=remaining)
                        # Refresh roughly once a minute, more often near the end.
                        if remaining % 60 < 30 or remaining < 60:
                            print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] ⏳ 等待中... 下次抓取: {next_time.strftime('%Y-%m-%d %H:%M:%S')} ({int(remaining//60)}分{int(remaining%60)}秒)", end='\r')

                print()  # finish the carriage-return status line

    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断守护进程")
        logger.info("守护进程被用户中断")
        return 0
    except Exception as e:
        logger.error(f"守护进程执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
|
||
|
||
|
||
def main():
    """Parse command-line arguments and dispatch to the matching command.

    Returns:
        int: exit code of the executed command, or 1 when no / an
        unknown command was given or an unexpected error occurred.
    """
    parser = argparse.ArgumentParser(
        description='字节跳动直播服务平台数据抓取工具',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # FIX: the epilog previously showed only login/status examples
        # although four commands exist.
        epilog="""
示例:
  python main.py login    # 触发登录流程
  python main.py status   # 查看Cookie状态
  python main.py crawl    # 抓取排行榜数据
  python main.py daemon   # 循环运行模式(守护进程)
"""
    )

    subparsers = parser.add_subparsers(dest='command', help='可用命令')

    # login command: interactive browser login.
    subparsers.add_parser('login', help='触发登录流程')

    # status command: report cookie file state.
    subparsers.add_parser('status', help='查看Cookie状态')

    # crawl command: one-shot leaderboard crawl.
    crawl_parser = subparsers.add_parser('crawl', help='抓取排行榜数据')
    crawl_parser.add_argument('--rank-type', type=int, default=0, help='排行类型,0=总榜')
    crawl_parser.add_argument('--size', type=int, default=10, help='每页数量')
    crawl_parser.add_argument('--filter-type', type=str, default='anchor', help='过滤类型')

    # Mutually exclusive pair toggling fan-data fetching (default: on).
    fans_group = crawl_parser.add_mutually_exclusive_group()
    fans_group.add_argument('--fetch-fans', action='store_true', default=True, dest='fetch_fans', help='抓取粉丝数据(默认)')
    fans_group.add_argument('--no-fetch-fans', action='store_false', dest='fetch_fans', help='不抓取粉丝数据')

    # daemon command: periodic crawling loop.
    daemon_parser = subparsers.add_parser('daemon', help='循环运行模式(守护进程),定时自动抓取数据')
    daemon_parser.add_argument('--rank-type', type=int, default=0, help='排行类型,0=总榜')
    daemon_parser.add_argument('--size', type=int, default=10, help='每页数量')
    daemon_parser.add_argument('--filter-type', type=str, default='anchor', help='过滤类型')
    daemon_parser.add_argument('--interval-no-fans', type=float, default=None,
                               help=f'不包含粉丝数据的抓取间隔(小时,默认: {DAEMON_INTERVAL_NO_FANS_HOURS})')
    daemon_parser.add_argument('--interval-with-fans', type=float, default=None,
                               help=f'包含粉丝数据的抓取间隔(小时,默认: {DAEMON_INTERVAL_WITH_FANS_HOURS})')

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return 1

    # Dispatch table keeps command routing flat and easy to extend.
    dispatch = {
        'login': lambda: login_command(),
        'status': lambda: status_command(),
        'crawl': lambda: crawl_command(args),
        'daemon': lambda: daemon_command(args),
    }

    try:
        handler = dispatch.get(args.command)
        if handler is None:
            parser.print_help()
            return 1
        return handler()
    except Exception as e:
        logger.error(f"程序执行失败: {e}", exc_info=True)
        print(f"\n❌ 发生错误: {e}")
        return 1
|
||
|
||
|
||
# Script entry point: propagate the command's exit code to the shell.
if __name__ == '__main__':
    sys.exit(main())
|
||
|