Downloading video URLs from YouTube

2019-07-05 本文已影响0人狼无雨雪


"""
Collect YouTube video URLs by scrolling through search-result pages in a headless browser.
"""
import re
from selenium import webdriver
import time
import os
import sys
import re
from bs4 import BeautifulSoup
import random
from selenium.webdriver.chrome.options import Options

# Search phrases submitted to YouTube, one results page per phrase.
# NOTE: despite the name, these are query strings, not URLs.
down_loading_urls = [
    "Aerial Freestyle Skiing",
    "Freestyle Skiing Aerials",
    "Freestyle Skiing Men Aerials",
    "Freestyle Skiing Women Aerials",
    "Freestyle Skiing - Ladies' Aerials",
    "Men's Freestyle Skiing",
    "Women's Freestyle Skiing",
    "自由式滑雪空中技巧",
]

if __name__ == "__main__":
    # Collect YouTube watch-page URLs for every search phrase in
    # down_loading_urls by repeatedly scrolling the results page in a
    # headless Firefox and harvesting the <a id="video-title"> anchors.
    #
    # Outputs (both inside the Skiing-youtube/ directory):
    #   temp_youtube.txt - append-only log of URLs as they are discovered,
    #                      so partial progress survives a crash;
    #   youtube.txt      - the de-duplicated URL set, rewritten on exit.
    from urllib.parse import quote_plus  # local import: only the script needs it

    baidu_path = 'Skiing-youtube'  #"wikiart"
    temp_path = baidu_path + "/" + "temp_youtube.txt"
    path = baidu_path + "/" + "youtube.txt"

    os.makedirs(baidu_path, exist_ok=True)

    # "-headless" and the `options=` keyword replace the deprecated
    # set_headless() / firefox_options= spellings.
    fireFoxOptions = webdriver.FirefoxOptions()
    fireFoxOptions.add_argument("-headless")
    browser = webdriver.Firefox(options=fireFoxOptions)

    asserts_all = set()

    # Stop scrolling a results page after this many consecutive rounds
    # that yield no previously unseen URL.
    max_stalled_rounds = 10

    try:
        for down_loading_url in down_loading_urls:
            print(down_loading_url)
            # quote_plus turns spaces into "+" (as before) and also escapes
            # apostrophes and non-ASCII characters in the phrase.
            original_url = ('https://www.youtube.com/results?search_query='
                            + quote_plus(down_loading_url))
            browser.get(original_url)

            # Reset per query so a stall on the previous phrase cannot end
            # this one prematurely.
            stalled_rounds = 0
            while stalled_rounds < max_stalled_rounds:
                # Random pause between scrolls to let results load.
                time.sleep(random.randint(1, 3))
                browser.execute_script("window.scrollBy(0,1000)")

                soup = BeautifulSoup(browser.page_source, 'lxml')
                new_urls = []
                for anchor in soup.find_all('a', {"id": "video-title"}):
                    href = anchor.get("href")
                    if not href:
                        # Anchors without a link target carry no URL.
                        continue
                    video_url = "https://www.youtube.com" + href
                    if video_url not in asserts_all:
                        asserts_all.add(video_url)
                        new_urls.append(video_url)

                if new_urls:
                    stalled_rounds = 0
                    # Best-effort progress log: a failed write must not
                    # abort the crawl, the final file is written on exit.
                    try:
                        with open(temp_path, 'a', encoding="utf-8") as w_file:
                            w_file.writelines(url + "\n" for url in new_urls)
                    except OSError as e_t:
                        print("temp write", e_t)
                else:
                    stalled_rounds += 1

                print(len(asserts_all))

    except Exception as e:
        # Top-level boundary: report the failure, then still persist
        # whatever was collected in the finally clause below.
        print("global", e)
    finally:
        try:
            with open(path, 'w', encoding="utf8") as write_file:
                for line in asserts_all:
                    write_file.write(str(line) + "\n")
        finally:
            # quit() ends the whole driver session; close() would only
            # close the current window.
            browser.quit()
Downloading video URLs from YouTube

猜你喜欢

热点阅读