工作生活

processing + threading fetch of big huaban images

2019-07-05  本文已影响0人  狼无雨雪
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import os
from bs4 import BeautifulSoup
import random
import threading
import multiprocessing
import warnings
from multiprocessing import Lock
warnings.filterwarnings("ignore")
# os.environ["PATH"] += os.pathsep + 'D:\google-art-downloader-master'

# chrome_options = Options()
# chrome_options.add_argument("--disable-gpu")
# chrome_options.add_argument("--headless")

# Accumulates every big-image src URL seen by this process's threads.
# NOTE(review): child processes get their own copy of this set after fork/spawn,
# so additions made inside worker processes never reach the parent — confirm
# whether the final dump in __main__ is expected to be empty.
images_all = set()
# browser = webdriver.Chrome(chrome_options = chrome_options)
# browser = webdriver.Chrome()
# browser = webdriver.PhantomJS(executable_path="phantomjs.exe")

# Batches of URLs: a list of lists, each inner list holding up to
# Threads_number URLs (one browser thread per URL).
origial_urls = []
original_urls_temp = []

# Fan-out sizes: threads per process batch, processes per outer batch.
Threads_number = 20
Processes_number = 4
num_cpu=multiprocessing.cpu_count()

# print("numbers of Threads: ",Threads_number)
# print("numbers of Processes: ",Processes_number)
# print("numbers of cpu: ",num_cpu)

# Read the relative pin paths (one per line) collected earlier and group the
# absolute URLs into chunks of Threads_number for the thread pool.
with open("huaban_pin_asserts_all.txt",'r',encoding="utf8") as read_file:
        lines = read_file.readlines()
        for index, line in enumerate(lines):
            url = "http://huaban.com" + line.strip()
            original_urls_temp.append(url)
            
            # Flush a full chunk, or the final partial chunk on the last line.
            if (index + 1) % Threads_number == 0 or (index + 1) == len(lines):
                origial_urls.append(original_urls_temp)
                original_urls_temp = []
# origial_urls


# def start_thread():
#     print("fuck")
# Single shared lock serializing appends to the temp results file.
# NOTE(review): the original created a fresh Lock() inside each call, which
# synchronizes nothing — a lock only works when all writers share one instance.
write_lock = Lock()

def get_image_url(index, index2, url, epoch, batch, index3):
    """Fetch one pin page and record the big-image URL it contains.

    Loads ``url`` in a headless PhantomJS browser, looks for the image under
    ``#baidu_image_holder`` (either ``a/img`` or a bare ``img`` child), adds
    its ``src`` to the process-local ``images_all`` set, and appends it to
    ``huaban_big_images_all_urls_temp.txt``.

    The index/epoch/batch parameters are progress counters used only in the
    log line. Any failure is swallowed: scraping is best-effort per page.
    """
    browser = None  # so the finally-clause is safe if construction fails
    try:
        # PhantomJS is deprecated in recent Selenium; kept to match the
        # rest of this script's environment.
        browser = webdriver.PhantomJS()
        browser.set_page_load_timeout(10000)
        browser.set_script_timeout(10000)
        # Random delay so 20 threads do not hit the server simultaneously.
        time.sleep(random.randint(1, 4))
        browser.get(url)

        line = None
        # The big image appears either wrapped in a link or as a direct child;
        # check both locations (the second match wins, as in the original).
        for xpath in ('//*[@id="baidu_image_holder"]/a/img',
                      '//*[@id="baidu_image_holder"]/img'):
            try:
                img = browser.find_element_by_xpath(xpath)
            except Exception:
                continue
            if img is not None:
                line = img.get_attribute('src')
                images_all.add(line)

        if line is not None:
            # Shared lock + context manager: no interleaved partial writes,
            # and the lock is always released even if open()/write() fails.
            with write_lock:
                with open("huaban_big_images_all_urls_temp.txt", 'a',
                          encoding="utf8") as write_temp_file:
                    write_temp_file.write(line + "\n")

        print("index: %d, epoch: %d, batch: %d, index3: %d, index2: %d, line: %s"%(index, epoch, batch, index3, index2, line))
    except Exception:
        # Best-effort: a failed page is simply skipped.
        pass
    finally:
        # Original called browser.close() unconditionally; if the webdriver
        # never constructed, that raised NameError and killed the thread.
        if browser is not None:
            browser.close()

        
def running_processing(urls, index, epoch, batch, index3):
    """Fan one batch of URLs out to worker threads and wait for them all.

    Creates one thread per URL (each running ``get_image_url``), starts
    every thread, then joins every thread before returning. The counter
    arguments are forwarded unchanged for progress logging.
    """
    print("start process %d number %d"%(batch, index3))
    workers = [
        threading.Thread(
            target=get_image_url,
            args=(index, position, page_url, epoch, batch, index3),
        )
        for position, page_url in enumerate(urls)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    

    

if __name__ == '__main__': 
    # Driver: consume URL chunks in groups of Processes_number; each group
    # becomes one "batch" of worker processes, each process fanning out into
    # Threads_number browser threads via running_processing.
    epoch = 0   # total URL chunks consumed so far (logging only)
    batch = 0   # process batches launched so far (logging only)
    len_original_urls = len(origial_urls)
    temp_urls_set = []  # chunks accumulated for the current process batch
    for index, urls in enumerate(origial_urls):
        temp_urls_set.append(urls)
        epoch += 1
        # Launch a batch when Processes_number chunks are queued, or on the
        # final (possibly partial) group.
        if (index + 1) % Processes_number == 0 or (index + 1) == len_original_urls:
            batch += 1
            multiThreads = []
            for index3, urls in enumerate(temp_urls_set):
#                 print(urls)
                mt = multiprocessing.Process(target=running_processing,args=(urls, index, epoch, batch, index3))
#                 mt = multiprocessing.Process(target=start_thread)
                mt.start()
                multiThreads.append(mt)
#             for index_i, mthread in enumerate(multiThreads):
#                 mthread.start()
            # Wait for the whole batch before starting the next one.
            for index_j, mthread in enumerate(multiThreads):
                mthread.join()
            temp_urls_set = []
#             print("end of batch: ",batch)
        
    # NOTE(review): images_all is only mutated inside the child processes,
    # whose memory is not shared with this parent — at this point the set is
    # still empty, so this file is written with no content. The real results
    # live in huaban_big_images_all_urls_temp.txt appended by the workers;
    # confirm whether this final dump was meant to read that temp file back.
    with open("huaban_big_images_all_urls.txt",'w',encoding="utf8") as write_file:
        for line in images_all:
            write_file.write(str(line) + "\n")
#     print("images_all")
#     print(images_all)
    print('program end:%s' %time.ctime())
上一篇 下一篇

猜你喜欢

热点阅读