Honor of Kings match data scraper

2019-11-27  等下流民

import time
import random
import re

import requests

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup

BASE_URL = "https://www.wanplus.com"
EVENT_URL = "%s/event/839.html" % BASE_URL
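# Event 839 appears to be a single 2019 tournament page on wanplus.com (an
# assumption based on the URL alone); it links out to the per-match schedule
# pages that the functions below scrape.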


def find_schedule_pages():
    text = requests.get(EVENT_URL).text
    # Raw string with an escaped dot so "." matches a literal period.
    hrefs = re.findall(r"/schedule/\d+\.html", text)
    # Drop duplicate links (the same schedule can appear more than once in the
    # page HTML) while preserving order.
    return list(dict.fromkeys(hrefs))
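
# find_schedule_pages() returns relative paths such as "/schedule/12345.html"
# (the id here is illustrative); main() prefixes them with BASE_URL.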


def chrome_get_texts(url, min_wait_seconds=1):
    print("chrome_get_texts %s" % url)
    texts = []
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')  # works around the "DevToolsActivePort file doesn't exist" error
    # chrome_options.add_argument('--headless')  # optional: run without a visible browser window
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        # Randomized delay gives the page time to render and spaces out requests.
        time.sleep(min_wait_seconds + min_wait_seconds * random.random())

        # Each <li> tab on the schedule page is one game of the series; click
        # through the tabs and snapshot the rendered HTML after each click.
        for i in range(1, 10):
            try:
                driver.find_element(By.XPATH, '//*[@id="info"]/div[2]/div[3]/ul/li[%d]' % i).click()
                time.sleep(min_wait_seconds + min_wait_seconds * random.random())
                texts.append(driver.page_source)
            except NoSuchElementException:
                break  # no more game tabs: the series had fewer games
    finally:
        driver.quit()
    return texts


def get_kings_infos(text):
    soup = BeautifulSoup(text, 'lxml')

    # Scoreboard header: home team, result banner, away team. Note that attrs
    # must be a dict ({"class": ...}), not a set as in the original post.
    home_team = soup.find("div", attrs={"class": "led_left"}).text.strip()
    game_result = soup.find("div", attrs={"class": "las_midd"}).text.strip()
    away_team = soup.find("div", attrs={"class": "sna_right"}).text.strip()

    # The result banner contains the winning team's name; normalize it to just that name.
    if home_team in game_result:
        game_result = home_team
    elif away_team in game_result:
        game_result = away_team

    # Bans and picks are rendered as <img> icons whose alt text is the hero name.
    home_bans = [img['alt'] for img in soup.find("div", attrs={"class": "led_left2"}).find_all('img')]
    away_bans = [img['alt'] for img in soup.find("div", attrs={"class": "sna_right2"}).find_all('img')]
    home_kings = [img['alt'] for img in soup.find("div", attrs={"class": "led_left1"}).find_all('img')]
    away_kings = [img['alt'] for img in soup.find("div", attrs={"class": "sna_right1"}).find_all('img')]
    return home_team, away_team, game_result, home_bans, away_bans, home_kings, away_kings
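
# For a single game, get_kings_infos returns a tuple shaped like
# ('HomeTeam', 'AwayTeam', 'Winner', [home bans], [away bans], [home picks], [away picks]);
# the names here are placeholders, and the ban/pick list lengths depend on the match format.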


def main():
    with open('2019kings.csv', 'w', encoding='utf-8') as f:
        for href in find_schedule_pages():
            schedule_url = "%s%s" % (BASE_URL, href)
            texts = chrome_get_texts(schedule_url)
            for text in texts:
                (home_team, away_team, game_result,
                 home_bans, away_bans, home_kings, away_kings) = get_kings_infos(text)
                items = [home_team, away_team, game_result] + home_bans + away_bans + home_kings + away_kings
                print(items)
                f.write(','.join(items) + '\n')


if __name__ == '__main__':
    main()
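
To sanity-check the output, here is a minimal sketch that reads the CSV back with
the standard-library csv module. The layout is positional: the first three fields
are home team, away team, and winner; the remaining fields are the ban and pick
lists, whose lengths can vary from game to game.

import csv

with open('2019kings.csv', encoding='utf-8') as f:
    for row in csv.reader(f):
        home_team, away_team, winner = row[:3]
        print(home_team, away_team, winner)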