我爱编程

Selenium + PhantomJS 实战 Airbnb

2018-02-02  本文已影响40人  ilililillililil

一、先看结果

二、思路

三、源码


from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import re

class Airbnb(object):
    """Print listing names, image styles and prices scraped from Airbnb.

    A Firefox browser is driven with Selenium; each rendered page is parsed
    with BeautifulSoup.

    Attributes:
        airbnb_urla: URL of a search-results page (Taipei, Taiwan, with fixed
            check-in/check-out dates). Not opened by ``get_airbnb``; kept so
            existing readers of the attribute keep working.
        aribnb_urlb: URL of the Airbnb landing page that ``get_airbnb`` opens.
            The misspelled attribute name is kept for backward compatibility.
    """

    # CSS selector of the clickable "next page" arrow in the pagination bar.
    _NEXT_PAGE_SELECTOR = (
        'li._b8vexar > a._1hjqg6h > div._1yofwd5 > div._1rltvky > svg'
    )
    # Class of the <div> whose direct children are the listing cards.
    _LISTINGS_CONTAINER_CLASS = '_fhph4u'
    # Classes of the <div>s holding a card's title and price text.
    _NAME_CLASS = '_saba1yg'
    _PRICE_CLASS = '_59f9ic'

    def __init__(self):
        self.airbnb_urla ='https://zh.airbnb.com/s/%E5%8F%B0%E5%8C%97--%E5%8F%B0%E6%B9%BE/homes?checkin=2018-01-11&checkout=2018-01-28&allow_override%5B%5D=&s_tag=5T-o2wsE&section_offset=1'
        self.aribnb_urlb = 'https://zh.airbnb.com/?af=43896654&c=%24pi%3A9.pk%3Abaidu_brd_brandzone_demand_title_p1&src=Baidu&medium=PPC&ag_kwid=2299-36-57701246c0b98773.6a0cc0f87b49337e'

    def get_airbnb(self):
        """Open the landing page and print every listing, page after page.

        For each page the method scrolls down, waits two seconds for content
        to render, prints the listings found in the page source, and then
        clicks the next-page arrow. Crawling stops when no next-page arrow is
        present. The browser is always closed, even if an error occurs.
        """
        # Local imports keep this block self-contained: the file-level import
        # section does not provide these two names.
        from selenium.common.exceptions import NoSuchElementException
        from selenium.webdriver.common.by import By

        browser = webdriver.Firefox()
        try:
            browser.get(self.aribnb_urlb)

            while True:
                # Scroll down so lazily rendered cards and the pagination bar
                # are present in the DOM.
                browser.execute_script('window.scrollBy(0,10000)')
                time.sleep(2)

                # Scrape the current page BEFORE navigating away from it.
                for name, urls, price in self._parse_listings(browser.page_source):
                    print('name:{}\njpgurl:{}\nprice:{}\n'.format(name,urls,price))

                # Go to the next page; a missing arrow means we are done.
                try:
                    next_arrow = browser.find_element(
                        By.CSS_SELECTOR, self._NEXT_PAGE_SELECTOR
                    )
                except NoSuchElementException:
                    break
                next_arrow.click()
        finally:
            browser.quit()

    @classmethod
    def _parse_listings(cls, html):
        """Return ``(name, image_style, price_tokens)`` for each card in *html*.

        ``price_tokens`` is the price text split on single spaces with the
        first two tokens dropped. Cards lacking a name, price or image node
        are skipped; an absent listings container yields an empty list.
        """
        soup = BeautifulSoup(html, 'lxml')
        container = soup.find('div', class_=cls._LISTINGS_CONTAINER_CLASS)
        if container is None:
            return []

        listings = []
        # Only direct child *tags*: plain iteration over a Tag also yields
        # text nodes, which cannot be searched.
        for card in container.find_all(True, recursive=False):
            name_tag = card.find('div', class_=cls._NAME_CLASS)
            price_tag = card.find('div', class_=cls._PRICE_CLASS)
            image_tag = card.find('div', attrs={'role': 'img'})
            if name_tag is None or price_tag is None or image_tag is None:
                continue
            listings.append((
                name_tag.get_text(),
                image_tag.get('style'),
                price_tag.get_text().split(' ')[2:],
            ))
        return listings

# Script entry point: build the scraper and start crawling when this file is
# executed directly (nothing runs on import).
if __name__ == '__main__':
    scraper = Airbnb()
    scraper.get_airbnb()

上一篇下一篇

猜你喜欢

热点阅读