工作生活

Fetch Huaban pin URLs and image URLs

2019-07-05  本文已影响0人  狼无雨雪
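"""
Collect pin links and image URLs from a Huaban (huaban.com) search results page.

NOTE: the original header said this was "really used in fetching url from
https://artsandculture.google.com/entity/m0bwbv?categoryid=art-movement",
but the code below loads a huaban.com search page instead.
"""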
from selenium import webdriver
import time
import os
from bs4 import BeautifulSoup
# os.environ["PATH"] += os.pathsep + 'D:\google-art-downloader-master'

# Start the Chrome WebDriver session that the rest of the script drives.
# This assignment was previously commented out, so the `browser.get(...)`
# call below failed with NameError. chromedriver must be discoverable
# (for example via PATH) for this to succeed.
browser = webdriver.Chrome()
# Load the Huaban pin search; the percent-encoded `q` value is the Chinese
# term for ink-wash painting.
browser.get('http://huaban.com/search/?q=%E6%B0%B4%E5%A2%A8%E7%94%BB&type=pins')






# Tunables for the scroll-and-scrape loop below.
SCROLL_STEP_PX = 1000    # vertical distance scrolled after each scan
POLL_INTERVAL_S = 1      # pause between scans (presumably lets new pins load)
MAX_IDLE_ROUNDS = 100    # stop after this many consecutive scans with no new image

asserts_all = set()      # unique pin links: hrefs that start with "/pin"
images_all = set()       # unique `src` values of the <img> inside those links
pin_number = 0
img_number = 0
is_running = True

count = 0                # consecutive scans in which img_number did not change
pre_img_num = 0          # img_number as of the previous scan

# Repeatedly parse the current DOM, harvest pin anchors, then scroll down.
# The loop ends once the image count has stayed flat for MAX_IDLE_ROUNDS
# scans in a row.
while is_running:
    soup = BeautifulSoup(browser.page_source, 'lxml')
    for anchor in soup.find_all('a'):
        href = anchor.get("href")
        if href is None or not href.startswith('/pin'):
            continue
        img = anchor.img
        if img is None:
            continue
        src = img.get("src")
        if src is None:
            continue
        # Only anchors that have both a pin href and an image src are kept.
        asserts_all.add(href)
        images_all.add(src)
    browser.execute_script("window.scrollBy(0,%d)" % SCROLL_STEP_PX)

    pin_number = len(asserts_all)
    print("pin numbers:", pin_number)

    img_number = len(images_all)
    print("img number", img_number)

    time.sleep(POLL_INTERVAL_S)

    # Track how long the image count has been unchanged.
    if pre_img_num == img_number:
        count += 1
    else:
        count = 0
    pre_img_num = img_number
    if count >= MAX_IDLE_ROUNDS:
        is_running = False
        
        
# Persist the collected pin links, one per line. The set is sorted so the
# file is reproducible between runs; iterating a set directly yields an
# arbitrary order.
with open("huaban_pin_asserts_all.txt", 'w', encoding="utf8") as write_file:
    write_file.writelines(str(line) + "\n" for line in sorted(asserts_all))

# Persist the collected image URLs in the same format.
with open("huaban_img_asserts_all.txt", 'w', encoding="utf8") as write_file:
    write_file.writelines(str(line) + "\n" for line in sorted(images_all))

# End the WebDriver session so the browser and driver processes do not
# linger after the script finishes (the close call was previously commented
# out, leaving them running).
browser.quit()

URL: http://huaban.com/search/?q=%E6%B0%B4%E5%A2%A8%E7%94%BB&type=pins


1540285217320.png
上一篇 下一篇

猜你喜欢

热点阅读