收集资料 Beautiful Soup - Python 笔记

2020-08-15  本文已影响0人  自走炮
from bs4 import BeautifulSoup # pip install beautifulsoup4
import requests
import time
import random

def _group_word_rows(cell_texts, columns=4):
    """Split a flat list of cell texts into complete rows of ``columns`` cells.

    Rows whose first cell is exactly a single ideographic space (U+3000) are
    dropped, as are trailing partial rows.
    """
    rows = []
    for start in range(0, len(cell_texts), columns):
        row = cell_texts[start:start + columns]
        # If the cell count is not a multiple of `columns`, the last slice is
        # short; indexing row[1]/row[2] on it later would raise IndexError.
        if len(row) < columns:
            continue
        if row[0] == '\u3000':
            continue
        rows.append(row)
    return rows


def run(page_url="http://www7b.biglobe.ne.jp/~browneye/english/TOEIC400-1.htm",
        output_path="toeic_words.txt"):
    """Scrape the ``<td>`` cells of a page and save two columns as text.

    The page is fetched, every ``<td>`` element's text is collected in
    document order, and the cells are grouped four at a time. For each kept
    group, the second and third cells are written to ``output_path`` as one
    ``second,third`` line. Values are written verbatim (no CSV quoting).

    Args:
        page_url: URL of the page to scrape. Defaults to the TOEIC word page
            this script was written for.
        output_path: File to (over)write with the extracted lines.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the output file cannot be written.
    """
    # A timeout keeps the script from hanging forever on a stalled server.
    r = requests.get(page_url, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    r.raise_for_status()
    # Let requests guess the charset from the body rather than trusting the
    # response headers.
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, features="html.parser")

    td_values = [td.text for td in soup.find_all("td")]
    word_rows = _group_word_rows(td_values)

    # Explicit UTF-8: the platform default codec may be unable to encode the
    # non-ASCII text scraped from the page.
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines("{},{}\n".format(row[1], row[2]) for row in word_rows)
    print("Yes, done.")

# Run the scraper only when this file is executed as a script, so that
# importing the module does not trigger the download and file write.
if __name__ == "__main__":
    run()
上一篇 下一篇

猜你喜欢

热点阅读