爬虫-去哪儿机票查询

2019-11-12  本文已影响0人  异同

之前一直用 requests + bs4 + re 写爬虫,这次改用 selenium 试一下:自动打开去哪儿网,查询往返机票,并把结果保存为 Excel。

from datetime import datetime, timedelta

import pandas as pd
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class SpiderQunar:
    """Scrape round-trip flight search results from the Qunar flight site.

    Constructing an instance does all the work: it starts Chrome through
    Selenium, submits a round-trip search, parses the result page and stores
    three DataFrames on the instance:

    * ``go_df``   -- outbound flights (plus a ``' '`` separator column),
    * ``back_df`` -- return flights,
    * ``df``      -- both tables concatenated side by side.
    """

    def __init__(self,
                 city1='上海',
                 city2='广州',
                 date1='',
                 date2='',
                 url='https://flight.qunar.com/',
                 save_path=r'd:\Users\GMCC\Desktop\test.xlsx',
                 s_options=None,
                 exec_path="c:/users/python_package/chromedriver.exe"):
        """Run the search immediately and keep the parsed results.

        Args:
            city1: Departure city typed into the search form.
            city2: Arrival city typed into the search form.
            date1: Outbound date string. When empty, both dates are replaced
                by defaults: tomorrow (outbound) and the day after (return).
            date2: Return date string; ignored when ``date1`` is empty.
            url: Address of the search page.
            save_path: Destination used by :meth:`save_files`.
            s_options: Optional list/tuple of Chrome command-line arguments.
            exec_path: Path of the chromedriver executable.
        """
        # No dates configured: leave tomorrow, come back the day after.
        if date1 == '':
            date1, date2 = self._default_dates(datetime.now().date())
        self.go_df = None
        self.back_df = None
        self.df = None
        self.save_path = save_path
        self.browser = self.browser_init(exec_path, s_options)
        self.search_content = self.search_flight(url, city1, city2, date1, date2)
        self.go_df, self.back_df, self.df = self.paser(self.search_content)

    @staticmethod
    def _default_dates(today):
        """Return ``(tomorrow, day_after_tomorrow)`` as ISO date strings.

        ``timedelta`` arithmetic is used so the result rolls over month and
        year boundaries; ``today.replace(day=today.day + 1)`` raises
        ``ValueError`` on the last day of a month.
        """
        return (str(today + timedelta(days=1)),
                str(today + timedelta(days=2)))

    @staticmethod
    def _direction_suffix(class_attr):
        """Map a result-list ``class`` attribute to its column-name suffix.

        A list whose class contains ``back-list`` holds return flights
        (``_返程``); anything else is treated as the outbound list (``_去程``).
        """
        return '_返程' if 'back-list' in class_attr else '_去程'

    @staticmethod
    def browser_init(exec_path, s_options):
        """Create the Chrome driver, applying any extra command-line options.

        Args:
            exec_path: Path of the chromedriver executable.
            s_options: List/tuple of Chrome arguments (e.g. ``'--headless'``);
                any other value means "no extra options".

        Returns:
            The started ``webdriver.Chrome`` instance.
        """
        options = Options()
        if isinstance(s_options, (list, tuple)):
            for each in s_options:
                options.add_argument(each)
        # The options object must be handed to the driver, otherwise the
        # arguments collected above are silently ignored.
        return webdriver.Chrome(executable_path=exec_path, options=options)

    def search_flight(self, url, city1, city2, date1, date2):
        """Fill in the round-trip search form and return the result page HTML.

        The browser is always shut down before this method returns or raises,
        so a failed lookup does not leave a Chrome/driver process behind.

        Args:
            url: Address of the search page.
            city1: Departure city.
            city2: Arrival city.
            date1: Outbound date string written into the ``fromDate`` input.
            date2: Return date string written into the ``toDate`` input.

        Returns:
            The page source captured once a return-flight price is present.

        Raises:
            selenium.common.exceptions.TimeoutException: if no return-flight
                price appears within 10 seconds.
            selenium.common.exceptions.NoSuchElementException: if a required
                form element is missing.
        """
        browser = self.browser
        try:
            browser.get(url)
            # Departure city
            fromcity = browser.find_element(By.NAME, "fromCity")
            fromcity.clear()
            fromcity.send_keys(city1)
            # Arrival city
            tocity = browser.find_element(By.NAME, "toCity")
            tocity.clear()
            tocity.send_keys(city2)
            # Outbound date: set the value through JS instead of typing it.
            fromdate = browser.find_element(By.ID, 'fromDate')
            browser.execute_script("arguments[0].value=arguments[1]", fromdate, date1)
            # Click the (disabled) return-date box to switch to round-trip mode.
            browser.find_element(
                By.XPATH,
                '//div[contains(@class,"qcbox")'
                ' and contains(@class,"qdate")'
                ' and contains(@class,"toD")'
                ' and contains(@class,"qcbox_disable")]').click()
            # Return date
            todate = browser.find_element(By.ID, 'toDate')
            browser.execute_script("arguments[0].value=arguments[1]", todate, date2)
            # Submit the search
            sub_button = browser.find_element(
                By.XPATH,
                '//button[@class="btn_search" and @data-track="key=101020008&val=国内搜索"]')
            sub_button.click()
            # A "popular airports" dialog sometimes pops up and covers the
            # search button. It is optional: close it and resubmit only when
            # it is actually there.
            try:
                browser.find_element(By.ID, "closeXI20").click()
            except NoSuchElementException:
                pass
            else:
                sub_button.click()
            print("Redirecting..")
            # Wait until at least one return-flight price has been rendered.
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH,
                     "//div[contains(@class,'list-ct') and contains(@class,'back-list')]//p[@class='price-desc']")))
            return browser.page_source
        finally:
            # Close the browser window and the driver on every exit path.
            browser.quit()

    @staticmethod
    def _get_sche_info(content):
        """Parse one result list (outbound or return) into a DataFrame.

        Args:
            content: HTML of a single ``list-ct`` container whose flight cards
                consist of a ``div.left`` (schedule) and ``div.right`` (price).

        Returns:
            A DataFrame with one row per flight. Column names carry the
            suffix ``_去程`` for the outbound list and ``_返程`` for the return
            list. Fields missing from a card are stored as empty strings.
        """
        root = etree.HTML(content)

        def first(node, path):
            # First match of a relative XPath, or '' when the card lacks it.
            found = node.xpath(path)
            return found[0] if found else ''

        def joined(node, path):
            # All text matches concatenated ('' when there are none).
            return ''.join(node.xpath(path))

        lefts = root.xpath('//div[@class="left"]')
        rights = root.xpath('//div[@class="right"]')
        # Query each card element directly instead of serialising and
        # re-parsing it once per field.
        data = [
            (
                # Departure time
                first(left, ".//div[@class='dep']/p[@class='time']/text()"),
                # Arrival time
                first(left, ".//div[@class='arr']/p[@class='time']/text()"),
                # Departure airport
                joined(left, ".//div[@class='dep']/p[@class='airport']/span/text()"),
                # Arrival airport
                joined(left, ".//div[@class='arr']/p[@class='airport']/span/text()"),
                # Overnight-flight marker
                joined(left, ".//div[@class='arr']/p[@class='cross-day']/text()"),
                # Airline logo URL
                first(left, ".//div[@class='sub-info']/img[@class='air-logo']/@src"),
                # Flight number
                first(left, ".//div[@class='sub-info']/span/text()"),
                # Code-share marker
                joined(left, ".//div[@class='sub-info']/span[@class='share']/text()"),
                # Flight duration
                first(left, ".//div[@class='sub-info']/span[@class='dur']/text()"),
                # Total price (non-breaking spaces stripped)
                joined(right, ".//p[@class='price']//span/text()").replace('\xa0', ''),
            )
            for left, right in zip(lefts, rights)
        ]

        columns = ['起飞时间', '到达时间', '起飞机场', '到达机场',
                   '是否过夜航班', '航空公司logo', '航班号', '是否共享航班',
                   '时长', '价格']
        # The first <div> of the fragment is the list container; its class
        # tells whether this is the outbound or the return list.
        div_classes = root.xpath('//div/@class')
        suffix = SpiderQunar._direction_suffix(div_classes[0] if div_classes else '')
        return pd.DataFrame(data=data, columns=[each + suffix for each in columns])

    def paser(self, content):
        """Split the result page into outbound/return tables and combine them.

        Args:
            content: Full HTML source of the search result page.

        Returns:
            ``(go_df, back_df, df)`` where ``df`` is ``go_df`` and ``back_df``
            concatenated column-wise. ``go_df`` receives an extra ``' '``
            column filled with ``'|'`` that acts as a visual separator.

        Raises:
            IndexError: if the page has no ``go-list`` or ``back-list``
                container.
        """
        root = etree.HTML(content)
        # Outbound
        go_content = etree.tostring(
            root.xpath('//div[contains(@class,"list-ct") and contains(@class,"go-list")]')[0],
            encoding='utf8').decode()
        go_df = self._get_sche_info(go_content)
        # Return
        back_content = etree.tostring(
            root.xpath('//div[contains(@class,"list-ct") and contains(@class,"back-list")]')[0],
            encoding='utf8').decode()
        back_df = self._get_sche_info(back_content)
        # Combined view
        go_df[' '] = '|'
        df = pd.concat([go_df, back_df], axis=1)
        return go_df, back_df, df

    def save_files(self, df):
        """Write ``df`` to ``self.save_path`` as an Excel file without the index."""
        df.to_excel(self.save_path, index=False)


if __name__ == '__main__':
    # Example run: search Tianjin -> Harbin with the default dates and write
    # the combined table to the default save path. Guarded so that importing
    # this module does not launch a browser.
    spider = SpiderQunar(city1='天津', city2='哈尔滨')
    spider.save_files(spider.df)

上一篇 下一篇

猜你喜欢

热点阅读