爬虫-豆瓣音乐top250信息

2018-08-31  本文已影响12人  我问你瓜保熟吗

介绍:爬取豆瓣音乐TOP250的数据,练习到了MongoDB、正则表达式、lxml

import requests
from lxml import etree
import re
import time
import pymongo

# Global counter of how many album records have been scraped and stored so far;
# incremented inside get_url_info.
x = 0

# Connect to a local MongoDB instance and select database 'mydb',
# collection 'musictop', where each scraped album record is inserted.
client = pymongo.MongoClient('localhost', 27017)
mydb = client['mydb']
musictop = mydb['musictop']

# Browser-like User-Agent header sent with the listing-page requests
# (presumably to avoid being rejected as a bot — TODO confirm).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'
}

# Collect the URLs of the 25 albums on one listing page and scrape each one.
def get_url_music(url):
    """Fetch one TOP250 listing page and scrape every album linked on it.

    Each row of the listing table links the album twice (cover image and
    title), so the href list is thinned to every other entry.
    """
    # timeout keeps a dead connection from hanging the whole crawl forever
    html = requests.get(url, headers=headers, timeout=10)
    selector = etree.HTML(html.text)
    # @href reads the attribute value; text() would read the tag content instead
    music_urls = selector.xpath('//*[@id="content"]//div//tr//a/@href')
    # xpath already returns a list — no list() needed; keep every other link
    music_urls = music_urls[0::2]

    for music_url in music_urls:
        get_url_info(music_url)


# Extract the detail fields of a single album page and store them in MongoDB.
def get_url_info(music_url):
    """Scrape name/author/style/time/publisher/score from one album detail
    page and insert the record into the `musictop` collection.

    Also bumps the global progress counter `x` and prints the stored record.
    """
    # Bug fix: send the same headers as get_url_music (the original request
    # went out without them) and add a timeout so a stall cannot hang the run.
    html = requests.get(music_url, headers=headers, timeout=10)
    selector = etree.HTML(html.text)

    name = selector.xpath('//*[@id="wrapper"]/h1/span/text()')[0]
    # .*? matches non-greedily; re.S lets . match newlines too
    author = re.findall('表演者:.*?>(.*?)</a>', html.text, re.S)[0]
    # &nbsp; is the HTML non-breaking space that follows the field label
    style = re.findall('流派:</span>&nbsp;(.*?)<br>?', html.text, re.S)
    try:
        style = style[0].strip()
    except IndexError:  # field absent on this page — was a bare except before
        style = "未知"
    pubtime = re.findall('发行时间:</span>&nbsp;(.*?)<br>?', html.text, re.S)[0].strip()
    publisher = re.findall('出版者:</span>&nbsp;(.*?)<br>?', html.text, re.S)
    if len(publisher) == 0:
        publisher = "未知"
    else:
        publisher = publisher[0].strip()

    score = selector.xpath('//*[@id="interest_sectl"]//strong/text()')[0]

    # One dict per album. Bug fix: 'publisher' was computed above but never
    # stored in the original version.
    info = {
        'name': name,
        'author': author,
        'style': style,
        'time': pubtime,
        'publisher': publisher,
        'score': score,
    }

    # Insert the record into the musictop collection.
    musictop.insert_one(info)

    # Progress counter printed next to each stored record.
    global x
    x += 1
    print(x, info)


if __name__ == '__main__':
    # Ten listing pages, paginated 25 per page: start = 0, 25, ..., 225.
    page_urls = ['https://music.douban.com/top250?start={}'.format(start)
                 for start in range(0, 250, 25)]
    for page_url in page_urls:
        get_url_music(page_url)
        time.sleep(0.5)  # brief pause between pages to be polite to the server
(截图:正在爬取.png;已经存入到MongoDB中.png)






来自:从零开始学python网络爬虫

上一篇下一篇

猜你喜欢

热点阅读