CSDN博客标题抓取(注:下面的示例代码实际抓取的是博客园 cnblogs 分类页中的文章标题和链接)
2020-07-05 本文已影响0人
成功在于实践
import json
import time
from selenium import webdriver
class CsdnBlog:
    """Scrape post titles and links from a blog category page into a file.

    Creating an instance starts a Chrome session, loads the page and opens
    the output file for writing (truncating it). Call :meth:`close`, or use
    the instance as a context manager, when finished so that the file and
    the browser are released.
    """

    # Category page scraped when no URL is passed in.
    DEFAULT_URL = 'https://www.cnblogs.com/imyalost/category/1040462.html'
    # File that receives the scraped records when no path is passed in.
    DEFAULT_OUTPUT = './123.json'
    # Anchors whose text is the post title and whose href is the post link.
    TITLE_LINK_XPATH = '//*[@id="mainContent"]/div/div/div/div[1]/a'

    def __init__(self, url=DEFAULT_URL, output_path=DEFAULT_OUTPUT):
        """Start Chrome, load ``url`` and open ``output_path`` for writing.

        Args:
            url: Page to scrape. Defaults to the original hard-coded page.
            output_path: Destination file. Defaults to ``./123.json``.

        Raises:
            Whatever ``webdriver.Chrome()``, ``driver.get`` or ``open`` raise;
            if the failure happens after Chrome has started, the browser is
            shut down before the exception propagates.
        """
        self.url = url
        self.driver = webdriver.Chrome()
        try:
            self.driver.get(self.url)
            # Records are written with ensure_ascii=False, so the encoding
            # must be pinned: the platform default may not be able to
            # represent non-ASCII titles.
            self.file = open(output_path, 'w', encoding='utf-8')
        except BaseException:
            # Do not leave a browser behind when setup fails.
            self.driver.quit()
            raise

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release the file and the browser; exceptions are not suppressed."""
        self.close()
        return False

    def parse_data(self):
        """Collect the title links found on the currently loaded page.

        Returns:
            A dict mapping each anchor's text to its ``href`` attribute.
            Anchors with identical text collapse into one entry (the last
            one wins), as with any dict.
        """
        # The literal 'xpath' is the locator strategy name; the positional
        # find_elements(by, value) form is used because the
        # find_elements_by_xpath helper no longer exists in Selenium 4.
        anchors = self.driver.find_elements('xpath', self.TITLE_LINK_XPATH)
        return {anchor.text: anchor.get_attribute('href') for anchor in anchors}

    def save_data(self, data):
        """Append ``data`` to the output file as one JSON record.

        Each record is the JSON text of ``data`` followed by ``,`` and a
        newline, so the file is a sequence of comma-terminated records
        rather than a single JSON document.

        Args:
            data: A JSON-serialisable object (here, the title -> link dict).
        """
        record = json.dumps(data, ensure_ascii=False) + ',\n'
        self.file.write(record)
        # Flush immediately so the record is on disk even if the caller
        # never closes the scraper.
        self.file.flush()

    def close(self):
        """Close the output file and shut the browser down.

        Attributes that were never set (e.g. after a failed ``__init__``)
        are skipped. The browser is quit even if closing the file fails.
        """
        output = getattr(self, 'file', None)
        browser = getattr(self, 'driver', None)
        try:
            if output is not None and not output.closed:
                output.close()
        finally:
            if browser is not None:
                browser.quit()

    def run(self):
        """Load the page, scrape the title links and write them to the file."""
        # Reloaded here (the constructor already loaded it once) so that
        # repeated calls to run() always work on a fresh copy of the page.
        self.driver.get(self.url)
        data = self.parse_data()
        self.save_data(data)
if __name__ == '__main__':
    # Script entry point: scrape the page once and write the result.
    csdn = CsdnBlog()
    try:
        csdn.run()
    finally:
        # Release resources explicitly so the output is fully written and no
        # Chrome / chromedriver process is left running, even if run() fails.
        try:
            csdn.file.close()
        finally:
            csdn.driver.quit()