有来医生

2022-06-15  本文已影响0人  Lonelyroots
"""

精神心理科:
    "https://youlai.cn/dise/pk_9_0_1.html"

"""
from requests_html import HTMLSession
import csv
import re

class Spider:
    """Crawl youlai.cn's psychiatry disease listings and append each
    article's metadata and body text to a local CSV file."""

    def __init__(self):
        self.session = HTMLSession()
        self.level1_url = "https://youlai.cn/dise/pk_9_0_1.html"  # level-1 disease listing page
        self.HomeUrl = "https://youlai.cn"     # site root, prepended to relative links

    def parseLevel1(self):
        """Parse every disease category on the level-1 listing page and crawl each."""
        response = self.session.get(url=self.level1_url)
        level2_urlTitle = response.html.xpath('//dl[@class="textList"]//a/text()')
        level2_urlList = response.html.xpath('//dl[@class="textList"]//a/@href')
        # zip keeps url/title pairs aligned and never indexes past the shorter list
        for url, title in zip(level2_urlList, level2_urlTitle):
            self.parseArticleHome(url, title)

    def parseArticleHome(self, url, title):
        """Walk every pagination page of one disease's article list.

        `url` is a site-relative disease page link; `title` is its display
        name (currently unused beyond the call signature).
        """
        # e.g. /dise/480.html -> https://youlai.cn/dise/articlelist/480_%s.html
        articleHomeUrl = (self.HomeUrl + url).replace('dise/', 'dise/articlelist/').replace('.html', '_%s.html')
        page = 1
        TrailerPage = 1     # last page number; discovered from the first response
        while page <= TrailerPage:
            response = self.session.get(url=articleHomeUrl % page)
            if TrailerPage == 1:
                # The second-to-last pager <li> holds the last page number.
                # Guard against single-page listings that render no pager,
                # which would otherwise raise IndexError on [0].
                pager = response.html.xpath('//div[@id="pages"]//li[last()-1]/a/text()')
                if pager:
                    TrailerPage = int(pager[0])
            articleList = self.getArticleList(response)
            self.parseArticle(articleList)
            page += 1

    def parseArticle(self, articleList):
        """Fetch each article URL and append one CSV row with its details."""
        for url in articleList:
            response = self.session.get(url=self.HomeUrl + url)
            title = response.html.xpath('//h3[@class="v_title"]/text()')[0]     # article title
            createTime = response.html.xpath('//span[@class="fl_left time"]/text()')[0]     # publish time
            readingQuantity = response.html.xpath('//span[@class="fl_left num"]/text()')[0].replace('阅读:', '')       # read count
            doctorName = response.html.xpath('//dl[contains(@class,"doc_pic_box")]/dd//li/strong/text()')       # doctor name
            hospitalName = response.html.xpath('//dl[contains(@class,"doc_pic_box")]/dd//p[1]/text()')      # hospital name
            officeName = response.html.xpath('//dl[contains(@class,"doc_pic_box")]/dd//p[2]/text()')        # department name
            # Body text is extracted with a regex over the raw HTML; guard
            # against pages whose markup lacks the expected <div class="text">
            # instead of crashing the whole crawl on one malformed page.
            body = re.findall('<div class="text">(.*?)</div>', response.text, re.S)
            content = body[0].strip() if body else ''
            # 'a+' appends, preserving the header row written at startup
            with open('内科文章.csv', 'a+', encoding='utf-8', newline='') as fp:
                writer = csv.writer(fp)
                writer.writerow((title, createTime, readingQuantity, doctorName, hospitalName, officeName, content))
            print("******", title, createTime, readingQuantity, doctorName, hospitalName, officeName, "******")
            print(content)

    def getArticleList(self, response):
        """Return the list of relative article URLs found on one listing page."""
        return response.html.xpath('//ul[@class="article_left article_l_list bd_none"]//h3/a/@href')

    def run(self):
        """Entry point: crawl starting from the level-1 listing."""
        self.parseLevel1()

if __name__ == '__main__':
    spider = Spider()
    # Recreate the CSV and write the header row before crawling starts;
    # the spider then appends one row per scraped article.
    header_row = ('文章标题', '发表时间', '阅读量', '医生名', '院名', '科室', '正文')
    with open('内科文章.csv', 'w', encoding='utf-8', newline='') as fp:
        csv.writer(fp).writerow(header_row)
    spider.run()

"""

精神分裂症:
    https://youlai.cn/dise/480.html
    https://youlai.cn/dise/articlelist/480_1.html
自闭症:
    https://youlai.cn/dise/481.html
    https://youlai.cn/dise/articlelist/481_1.html

"""

文章到这里就结束了!希望大家能多多支持Python(系列)!六个月带大家学会Python,私聊我,可以问关于本文章的问题!以后每天都会发布新的文章,喜欢的点点关注!一个陪伴你学习Python的新青年!不管多忙都会更新下去,一起加油!

Editor:Lonelyroots

上一篇 下一篇

猜你喜欢

热点阅读