采集华秋论坛小组信息存入csv文件
2021-06-30 本文已影响0人
是东东
import csv
import time
import requests
from lxml import etree
def write_to_file(file_path, item, n):
    """Append one record to a CSV file, emitting a header row when n == 1.

    Args:
        file_path: Path of the CSV file. It is opened in append mode, so
            existing content is kept and the file is created if missing.
        item: Mapping of column name -> value for a single row. The keys
            (in iteration order) become the header, the values the row.
        n: Sequence number of the record. The header row is written only
            when this equals 1.
    """
    # Materialize header and row before touching the file so a bad `item`
    # fails without creating or modifying the output.
    header = list(item)
    row = list(item.values())
    # newline='' lets the csv module control line endings itself.
    with open(file_path, mode='a', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        if n == 1:
            writer.writerow(header)
        writer.writerow(row)
def get_group_info(pages=12, file_path='group_info.csv'):
    """Scrape the elecfans forum group index and append each group to a CSV.

    Fetches index pages 1..pages (ordered by member count), extracts every
    ``div.glist-msg`` entry and writes it through ``write_to_file``. Progress
    is printed to stdout.

    Args:
        pages: Number of index pages to fetch, starting from page 1.
        file_path: CSV file the records are appended to.

    Raises:
        requests.RequestException: If a page request fails or times out.
    """
    # The header name must be 'User-Agent'; the misspelled name used before
    # was sent as an unrelated custom header, so the browser string was
    # never applied as the user agent.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'}
    n = 1  # running record counter; record 1 triggers the CSV header row
    for page in range(1, pages + 1):
        url = f'https://bbs.elecfans.com/group.php?mod=index&orderby=membernum&page={page}'
        print(url)
        # A timeout keeps a stalled connection from hanging the whole run.
        req = requests.get(url, headers=headers, timeout=30)
        time.sleep(5)  # pause between page requests
        tree = etree.HTML(req.text)
        for detail in tree.xpath('//div[@class="glist-msg"]'):
            item = {
                '组名': ''.join(detail.xpath('./h3/a/@title')),
                # The href has the 'group_' prefix removed to leave the id
                # portion (any other part of the href is kept as-is).
                'id': ''.join(detail.xpath('./h3/a/@href')).replace('group_', ''),
                '加入成员': ''.join(detail.xpath('./div[@class="glist-dec"]/span/text()')).replace('个成员', ''),
            }
            write_to_file(file_path, item, n)
            print(item)
            print(f'采集第 {n} 个')
            n += 1
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    get_group_info()
输出内容
![](https://img.haomeiwen.com/i13530319/a83c083c91dbdc4d.png)