Selenium + PhantomJS 实战 Airbnb
2018-02-02 本文已影响40人
ilililillililil
一、先看结果
二、思路
三、源码
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
class Airbnb(object):
    """Page through Airbnb listings in Firefox and print each listing.

    For every result page the scraper prints the listing name, the inline
    ``style`` attribute of its image element, and the price tokens.
    """

    # CSS selector of the "next page" arrow in the pagination bar.
    _NEXT_PAGE_SELECTOR = (
        'li._b8vexar > a._1hjqg6h > div._1yofwd5 > div._1rltvky > svg'
    )

    def __init__(self):
        # Search-results URL (Taipei, Taiwan; 2018-01-11 to 2018-01-28).
        # Not opened by get_airbnb(); kept so existing callers can read it.
        self.airbnb_urla = 'https://zh.airbnb.com/s/%E5%8F%B0%E5%8C%97--%E5%8F%B0%E6%B9%BE/homes?checkin=2018-01-11&checkout=2018-01-28&allow_override%5B%5D=&s_tag=5T-o2wsE&section_offset=1'
        # URL that get_airbnb() opens. The attribute keeps its historical
        # "aribnb" spelling so code that reads it keeps working.
        self.aribnb_urlb = 'https://zh.airbnb.com/?af=43896654&c=%24pi%3A9.pk%3Abaidu_brd_brandzone_demand_title_p1&src=Baidu&medium=PPC&ag_kwid=2299-36-57701246c0b98773.6a0cc0f87b49337e'

    def get_airbnb(self):
        """Open ``self.aribnb_urlb`` and print the listings of every page.

        Each iteration scrolls down, waits two seconds, prints the listings
        found in the current page source, then clicks the "next page" arrow.
        The loop ends when that arrow can no longer be found. The browser is
        always closed, even if an error interrupts the crawl.
        """
        browser = webdriver.Firefox()
        try:
            browser.get(self.aribnb_urlb)
            while True:
                # Scroll down so content below the fold gets rendered.
                browser.execute_script('window.scrollBy(0,10000)')
                time.sleep(2)
                # Read the current page *before* navigating away from it.
                self._print_listings(browser.page_source)
                try:
                    next_arrow = browser.find_element(
                        By.CSS_SELECTOR, self._NEXT_PAGE_SELECTOR)
                except NoSuchElementException:
                    # No "next page" arrow: treat this as the last page.
                    break
                next_arrow.click()
        finally:
            browser.quit()

    def _print_listings(self, html):
        """Print name, image style and price for each listing in *html*."""
        soup = BeautifulSoup(html, 'lxml')
        container = soup.find('div', class_='_fhph4u')
        if container is None:
            # Listing container is absent from this page source.
            return
        # Only direct child tags are listing cards; text nodes are skipped.
        for card in container.find_all(True, recursive=False):
            name_tag = card.find('div', class_='_saba1yg')
            price_tag = card.find('div', class_='_59f9ic')
            image_tag = card.find('div', attrs={'role': 'img'})
            if name_tag is None or price_tag is None or image_tag is None:
                # Not a complete listing card; nothing to report.
                continue
            name = name_tag.get_text()
            # Drop the first two space-separated tokens of the price text.
            price = price_tag.get_text().split(' ')[2:]
            urls = image_tag.get('style', '')
            print('name:{}\njpgurl:{}\nprice:{}\n'.format(name, urls, price))
if __name__ == '__main__':
    # Script entry point: build the scraper and start crawling right away.
    Airbnb().get_airbnb()