爬虫代码

声明：刚刚的帖子忘记删__client_id了，因此言论“你不配写爬虫”不是我发的（~~被盗号了~~）

问题：现在只能做到3s/个，求加速优化方法

PYTHON

from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from playwright.sync_api import sync_playwright

# Replace this with your own Cookie header string ("name=value; name2=value2").
# It carries the login session (__client_id), so do not publish the real value.
cookie_str = "_uid=1494460; __client_id=******; C3VK=0f6f42"

# Cookies are registered for this domain only, so the login session is not
# attached to requests the page makes to other hosts.
_COOKIE_DOMAIN = ".luogu.com.cn"

_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

# NOTE(review): the data-v-* attribute looks like a build-generated hash and
# may change when the site is redeployed -- confirm it is stable.
_TEAM_NAME_SELECTOR = 'h1.lfe-h1[data-v-f265fec6]'

# Upper bound (ms) on how long to wait for the title element after "load".
_TITLE_WAIT_MS = 1000


def _parse_cookie_string(raw, domain=_COOKIE_DOMAIN):
    """Convert a ``name=value; name2=value2`` string into Playwright cookie dicts."""
    cookies = []
    for part in raw.split(";"):
        name, sep, value = part.strip().partition("=")
        name = name.strip()
        if not sep or not name:
            # Skip empty fragments and entries without a "name=" part.
            continue
        cookies.append(
            {"name": name, "value": value.strip(), "domain": domain, "path": "/"}
        )
    return cookies


def _new_context(browser):
    """Create a browser context carrying the crawler's user agent, headers and cookies."""
    context = browser.new_context(
        user_agent=_USER_AGENT,
        extra_http_headers={
            "Referer": "https://www.luogu.com.cn/",
            "x-requested-with": "XMLHttpRequest",
        },
    )
    # Cookies go through the cookie jar instead of extra_http_headers: extra
    # headers are added to every request, including third-party ones.
    cookies = _parse_cookie_string(cookie_str)
    if cookies:
        context.add_cookies(cookies)
    return context


def _fetch_team(context, url, team_id):
    """Open *url* in a fresh page of *context* and read the team name from it."""
    page = None
    try:
        page = context.new_page()
        page.goto(url, timeout=10000, wait_until="load")
        try:
            # Returns as soon as the title is in the DOM instead of always
            # sleeping for the full interval.
            h1 = page.wait_for_selector(
                _TEAM_NAME_SELECTOR, state="attached", timeout=_TITLE_WAIT_MS
            )
        except PlaywrightTimeoutError:
            h1 = None

        if h1 is None:
            print(f"团队编号 {team_id}: 未找到团队名称")
            return None

        team_name = h1.inner_text().strip()
        print(f"团队编号: {team_id}, 团队名称: {team_name}")
        return {"team_id": team_id, "team_name": team_name}

    except Exception as e:
        # Best-effort crawl: report the failure and let the caller continue
        # with the next team id.
        print(f"爬取团队编号 {team_id} 时出错: {str(e)}")
        return None

    finally:
        if page is not None:
            page.close()


def crawler_with_browser(url, team_id, *, context=None):
    """
    Scrape a Luogu team page and return the team's id and name.

    :param url: URL of the team page
    :param team_id: numeric id of the team (used in the result and log output)
    :param context: optional, already-configured Playwright browser context to
        reuse. When omitted, a browser is launched and closed for this single
        call, which is much slower when crawling many pages.
    :return: ``{"team_id": ..., "team_name": ...}``, or ``None`` when the title
        element is not found or the page fails to load
    """
    if context is not None:
        return _fetch_team(context, url, team_id)

    with sync_playwright() as p:
        browser = p.chromium.launch(timeout=10000)
        try:
            return _fetch_team(_new_context(browser), url, team_id)
        finally:
            browser.close()


if __name__ == "__main__":
    base_url = "https://www.luogu.com.cn/team/"
    # Inclusive range of team ids to crawl.
    start_id = 1000
    end_id = 122455

    # Launch the browser once and share one context across all pages; starting
    # Chromium for every id dominates the per-page cost.
    with sync_playwright() as p:
        browser = p.chromium.launch(timeout=10000)
        try:
            shared_context = _new_context(browser)
            for team_id in range(start_id, end_id + 1):
                crawler_with_browser(
                    base_url + str(team_id), team_id, context=shared_context
                )
        finally:
            browser.close()

    print("已爬取指定范围的团队信息，程序结束")

讨论操作

回复

相关推荐

爬虫代码