采集华秋论坛小组信息并存入 CSV 文件

2021-06-30  本文已影响0人  是东东
import csv
import time
import requests
from lxml import etree


def write_to_file(file_path, item, n):
    """Append one record to a CSV file, preceded by a header for the first record.

    Args:
        file_path: Path of the CSV file; opened in append mode as UTF-8.
        item: Mapping of column name to value describing a single row.
        n: Record counter supplied by the caller; the header row (the keys
            of ``item``) is written only when ``n == 1``.
    """
    header = list(item.keys())
    row = list(item.values())
    # newline='' leaves line-ending handling to the csv module.
    with open(file_path, mode='a', encoding='utf-8', newline='') as handle:
        out = csv.writer(handle)
        if n == 1:
            out.writerow(header)
        out.writerow(row)


def get_group_info(pages=12, file_path='group_info.csv', delay=5):
    """Scrape the group listing pages of bbs.elecfans.com into a CSV file.

    Requests listing pages 1..``pages`` (ordered by member count via the
    ``orderby=membernum`` query parameter), extracts one record per
    ``div.glist-msg`` element and appends it to ``file_path`` through
    ``write_to_file``. Progress is printed to stdout.

    Args:
        pages: Number of listing pages to fetch, starting at page 1.
        file_path: CSV file the records are appended to.
        delay: Seconds to sleep after each page request.

    Raises:
        requests.exceptions.RequestException: If a page request fails or
            exceeds the 30 second timeout.
    """
    # The header key must be spelled 'User-Agent'; any other key is sent as
    # an unrelated header and requests falls back to its default User-Agent.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'}
    n = 1  # running record counter; write_to_file emits the CSV header when it is 1
    for page in range(1, pages + 1):
        url = f'https://bbs.elecfans.com/group.php?mod=index&orderby=membernum&page={page}'
        print(url)
        # Without a timeout a stalled connection would block the run forever.
        req = requests.get(url, headers=headers, timeout=30)
        time.sleep(delay)  # pause between page requests
        tree = etree.HTML(req.text)
        details = tree.xpath('//div[@class="glist-msg"]')
        for detail in details:
            item = {
                '组名': ''.join(detail.xpath('./h3/a/@title')),
                # NOTE(review): only the 'group_' substring is stripped; any
                # other part of the href is kept as-is -- confirm href shape.
                'id': ''.join(detail.xpath('./h3/a/@href')).replace('group_', ''),
                '加入成员': ''.join(detail.xpath('./div[@class="glist-dec"]/span/text()')).replace('个成员', ''),
            }
            write_to_file(file_path, item, n)
            print(item)
            print(f'采集第 {n} 个')
            n += 1


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    get_group_info()

输出内容

上一篇 下一篇

猜你喜欢

热点阅读