# Python爬虫 || 基于 Jupyter 工具 (Python web scraper, written with Jupyter)
import requests
from lxml import etree
# Browser-like User-Agent sent with every request.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}


def _extract_director(credits):
    """Return the director part of one credits text node, or None.

    The text is expected to contain "导演" followed by the director name(s)
    and, optionally, "主演" followed by the cast.  Working on the raw text
    (instead of slicing the ``str()`` of the XPath result list with fixed
    offsets) avoids mis-cut values when "主演" is absent or the node is
    missing.

    Args:
        credits: The text node holding the "导演: ... 主演: ..." line.

    Returns:
        The stripped director string, or None when the text has no "导演"
        marker or nothing follows it.
    """
    _, marker, rest = credits.partition("导演")
    if not marker:
        return None
    # Drop the cast section; partition leaves `rest` unchanged if "主演" is absent.
    rest = rest.partition("主演")[0]
    # NOTE(review): presumably director and cast are separated by a run of
    # non-breaking spaces, and a truncated line may end in "主..." without the
    # full "主演" marker -- cutting at the first \xa0 covers both. Confirm
    # against the live page markup.
    rest = rest.partition("\xa0")[0]
    # Remove the colon that follows "导演" plus surrounding whitespace.
    director = rest.lstrip(":：").strip()
    return director or None


# NOTE: the results are bound to the module-level name `list` (shadowing the
# builtin) because the analysis code later in this file reads that name.
list = []
for start in range(0, 250, 25):
    url = 'https://movie.douban.com/top250?start={}&filter='.format(start)
    # A timeout keeps a stalled connection from hanging the run forever.
    page = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an error response instead of collecting bogus entries.
    page.raise_for_status()
    tree = etree.HTML(page.text)
    # Walk the <li> rows actually present rather than assuming exactly 25.
    for item in tree.xpath('//*[@id="content"]/div/div[1]/ol/li'):
        nodes = item.xpath('./div/div[2]/div[2]/p[1]/text()[1]')
        if not nodes:
            continue
        director = _extract_director(nodes[0])
        if director is None:
            continue
        print(director)
        list.append(director)
print(list)
import pandas as pd
# Load the collected strings into a single-column frame; the column label is 0.
fr = pd.DataFrame(list)
# Count how often each distinct value occurs in column 0.
pinlv = fr.loc[:, 0].value_counts()
# Show the first 20 rows of the counts, then the type of the result object.
print(pinlv.head(20))
print(type(pinlv))