【Python爬虫】第十五次作业
2017-09-03 本文已影响10人
Mango907
import requests
from lxml import etree
# Crawl the category menu of www.ygdy8.com and collect the URL of every
# paginated listing page of each category into ``url_list``.
#
# NOTE(review): the pasted source had lost all leading whitespace. The nesting
# below is reconstructed from the control flow; in particular the final
# ``print(len(url_list))`` is assumed to run once, after the menu loop --
# confirm against the author's original file.

# Site root; menu hrefs are appended to it verbatim.
url = 'http://www.ygdy8.com'

# Filled by collect_page_urls() with one URL per listing page.
url_list = []

# Seconds to wait on a request before giving up, so that a stalled
# connection cannot hang the crawl indefinitely.
REQUEST_TIMEOUT = 10


def fetch_selector(page_url: str):
    """Download *page_url* and return it parsed by ``etree.HTML``.

    The response body is decoded as gb2312 before parsing.

    Raises:
        requests.RequestException: on connection problems, timeouts, or a
            4xx/5xx status (via ``raise_for_status``).
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = 'gb2312'
    return etree.HTML(response.text)


def parse_page_total(pager_text: str) -> int:
    """Return the page count found in the pager text.

    Takes the part before the first ``/``, drops the characters ``共`` and
    ``页`` and converts what is left to ``int`` (surrounding whitespace is
    tolerated by ``int``).

    Raises:
        ValueError: if what remains is not an integer.
    """
    count_text = pager_text.split('/')[0].replace('共', '').replace('页', '')
    return int(count_text)


def parse_list_id(pager_href: str) -> str:
    """Return *pager_href* with ``2.html`` removed.

    Presumably the first pager link points at page 2 (e.g. ``list_23_2.html``),
    so the result is the file-name prefix shared by all pages -- verify
    against the live markup.
    """
    return pager_href.replace('2.html', '')


def build_page_urls(menu_url: str, list_id: str, page_total: int) -> list:
    """Return the URLs of pages ``1..page_total`` of one category.

    Each URL is *menu_url* with ``index`` in its last path segment replaced by
    ``list_id + <page number>``. Only the last segment is rewritten so that an
    ``index`` appearing in the host or in a directory name is left alone.
    An empty list is returned when *page_total* is less than 1.
    """
    head, slash, filename = menu_url.rpartition('/')
    return [
        head + slash + filename.replace('index', list_id + str(page))
        for page in range(1, page_total + 1)
    ]


def collect_page_urls() -> None:
    """Walk the home-page menu and append every listing-page URL to ``url_list``.

    For each menu link the category index page is fetched, its pager is read
    for the page count and the file-name prefix, and one URL per page is
    generated. Categories whose pager is missing or unreadable are reported
    and skipped instead of aborting the whole crawl. Progress and the final
    number of collected URLs are printed.
    """
    home = fetch_selector(url)
    # Links inside the first nine <li> items of the menu list.
    menu_links = home.xpath('//div[@class="contain"][1]/ul/li[position()<10]/a')

    for link in menu_links:
        a_text = link.xpath('text()')
        a_href = link.xpath('@href')
        # Links without text and the '经典影片' entry are skipped, as in the
        # original script (the reason for excluding that entry is not visible
        # here). A link without href cannot be followed either.
        if not a_text or a_text[0] == '经典影片' or not a_href:
            continue

        menu_url = url + a_href[0]
        print(a_text[0], menu_url)

        menu = fetch_selector(menu_url)
        pager_texts = menu.xpath('//div[@class="co_content8"]/div[@class="x"]//text()')
        pager_hrefs = menu.xpath('//div[@class="co_content8"]/div[@class="x"]//a/@href')
        # The page count is read from the second text node of the pager and
        # the prefix from its first link; without both there is nothing to do.
        if len(pager_texts) < 2 or not pager_hrefs:
            print('skipped, no pager found:', menu_url)
            continue

        try:
            page_total = parse_page_total(pager_texts[1])
        except ValueError:
            print('skipped, unreadable page count:', menu_url)
            continue
        print(page_total)

        list_id = parse_list_id(pager_hrefs[0])
        print(list_id)

        url_list.extend(build_page_urls(menu_url, list_id, page_total))

    print(len(url_list))


# Run the crawl only when executed as a script, so importing this module
# (e.g. to reuse the helpers) performs no network requests.
if __name__ == '__main__':
    collect_page_urls()