
Downloading image URLs from Baidu

2019-07-05  狼无雨雪


"""
really used in fetching url from google images
"""
import re
from selenium import webdriver
import time
import os
import sys
import re
from bs4 import BeautifulSoup
import random
from selenium.webdriver.chrome.options import Options

down_loading_urls = {
    
}


baidu_path = 'Willow_baidu'  # output directory for the collected URLs
# Baidu image search results page; the GB18030-encoded word parameter
# (%C1%F8%CA%F7) is the search term 柳树 (willow).
original_url = 'https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%C1%F8%CA%F7&fr=ala&ala=1&alatpl=adress&pos=0&hs=2&xthttps=111111'

temp_path = baidu_path + "/" + "temp_baidu.txt"  # incremental log, appended while scrolling
path = baidu_path + "/" + "baidu.txt"            # final deduplicated URL list



# If the browser driver is not on PATH, append its folder, e.g.:
# os.environ["PATH"] += os.pathsep + 'D:\google-art-downloader-master'
if not os.path.exists(baidu_path):
    os.makedirs(baidu_path)

# Headless Chrome alternative:
# option = webdriver.ChromeOptions()
# option.add_argument('--headless')
# option.add_argument('--disable-gpu')
# browser = webdriver.Chrome(options=option)

# Headless Firefox; set_headless() and the firefox_options= keyword
# are deprecated in newer Selenium releases.
fireFoxOptions = webdriver.FirefoxOptions()
fireFoxOptions.add_argument('-headless')
browser = webdriver.Firefox(options=fireFoxOptions)

asserts_all = set()  # all image URLs seen so far, deduplicated

now_len = 0    # URL count after the current scroll
pre_len = 0    # URL count after the previous scroll
count_all = 0  # consecutive scrolls that yielded no new URLs

try:
    browser.get(original_url)
    while True:
        # Scroll down a little at a time, pausing 1-3 s so that
        # lazily loaded results have a chance to appear.
        time.sleep(random.randint(1, 3))
        browser.execute_script("window.scrollBy(0,1000)")

        # Parse the rendered page and pull the original image URL
        # (the data-objurl attribute) out of every result item.
        soup = BeautifulSoup(browser.page_source, 'lxml')
        asserts = soup.find_all('li', {"class": "imgitem"})
        with open(temp_path, 'a', encoding="utf-8") as w_file:
            for item in asserts:
                url = item.get("data-objurl")
                if url:  # some items carry no data-objurl; skip them
                    w_file.write(url + "\n")
                    asserts_all.add(url)
        print(len(asserts_all))

        # Stop once ten consecutive scrolls add no new URLs,
        # i.e. the result list has stopped growing.
        now_len = len(asserts_all)
        if now_len == pre_len:
            count_all += 1
        else:
            count_all = 0
        if count_all >= 10:
            break
        pre_len = now_len

except Exception as e:
    print("global", e)
finally:
    # Dump the deduplicated URL set to the final output file.
    with open(path, 'w', encoding="utf-8") as write_file:
        for url in asserts_all:
            write_file.write(url + "\n")
    browser.quit()  # quit() ends the session; close() only closes one window
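
The script above only collects URLs; it never downloads an image. A minimal follow-up sketch for fetching the files listed in baidu.txt might look like this (the requests dependency, the User-Agent header, and the numeric file-naming scheme are assumptions, not part of the original script):

import os
import requests

baidu_path = 'Willow_baidu'
url_file = os.path.join(baidu_path, 'baidu.txt')
out_dir = os.path.join(baidu_path, 'images')
os.makedirs(out_dir, exist_ok=True)

headers = {'User-Agent': 'Mozilla/5.0'}  # many image hosts reject empty user agents

with open(url_file, encoding='utf-8') as f:
    for i, url in enumerate(line.strip() for line in f):
        if not url:
            continue
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()
            # Name files by position; the real extension may differ from .jpg.
            with open(os.path.join(out_dir, '%06d.jpg' % i), 'wb') as img:
                img.write(resp.content)
        except Exception as e:  # dead links are common in scraped URL lists
            print('skip', url, e)

Scraped object URLs go stale quickly, so expect a fair share of failures; the try/except simply skips them instead of aborting the whole run.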

    

    
