Python 爬虫
2017-11-30 本文已影响0人
这个太难了
1、爬取糗事百科
代码:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# time: 2017 / 10 / 23
import requests
import bs4
from bs4 import BeautifulSoup
def getHtmlText(url):
    """Download ``url`` and return the response body as text.

    The request uses a 30-second timeout and the body is decoded with the
    encoding that ``requests`` detects from the content
    (``apparent_encoding``).

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or a single space (``" "``) when the request
        fails or the server answers with an error status.
    """
    try:
        response = requests.get(url, timeout=30)
        # Turn 4xx/5xx answers into an exception so they take the fallback.
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        # Only request failures are swallowed; a bare ``except`` would also
        # hide KeyboardInterrupt and genuine programming errors.
        return " "
def readJokeText(html, fpath):
    """Extract the jokes from ``html``, print them and append them to a file.

    Every ``<div class="content">`` element is searched for its first
    ``<span>``; the text of that span is treated as one joke.

    Args:
        html: Markup of one listing page.
        fpath: Path of the UTF-8 text file the jokes are appended to, one per
            line. The file is only opened when at least one joke was found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    jokes = []
    for div in soup.find_all('div', attrs={'class': 'content'}):
        if not isinstance(div, bs4.element.Tag):
            continue
        span = div.find('span')
        if span is None:
            # A content div without a <span> would otherwise fail with
            # AttributeError on None; skip it.
            continue
        # Printing span.string can show None; getText() avoids that.
        jokes.append(span.getText())

    for joke in jokes:
        print(joke, end=' ')

    if jokes:
        # Open the file once per page; the with-block closes it, so no
        # explicit close() is needed.
        with open(fpath, 'a', encoding='utf-8') as out:
            out.writelines(joke + '\n' for joke in jokes)
def main(depth=2, path="F://Joke.txt"):
    """Fetch the first ``depth`` text pages of qiushibaike.com and save the jokes.

    Args:
        depth: Number of listing pages to fetch, starting at page 1.
        path: File the jokes are appended to.
    """
    start_url = "https://www.qiushibaike.com/text/page/"
    for page in range(1, depth + 1):
        html = getHtmlText(start_url + str(page))
        readJokeText(html, path)


# Run the crawl only when executed as a script, not as a side effect of import.
if __name__ == "__main__":
    main()
2、爬取淘宝手机信息
代码:
import requests
import re
def getHTMLText(url):
    """Fetch ``url`` and return its body as text, or ``""`` on failure.

    Args:
        url: Address of the page to download.

    Returns:
        The page text decoded with the encoding ``requests`` detects from the
        content, or an empty string when the request fails, times out
        (30 seconds) or the server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # HTTP error codes take the fallback path too
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrower than the former bare ``except``: Ctrl-C and programming
        # errors are no longer silently turned into an empty page.
        return ""
def parsePage(ilt, html):
    """Extract ``[title, price, month_sales]`` records from a result page.

    The page source is scanned for ``"title":"..."``, ``"price":"..."`` and
    ``"month_sales":"..."`` fragments. The n-th match of each kind is combined
    into one list of three strings and appended to ``ilt``. When the three
    kinds of fragment occur a different number of times, only as many records
    as the rarest kind are produced.

    Args:
        ilt: List that receives the records; it is modified in place.
        html: Raw text of the page. An empty or missing page adds nothing.
    """
    import ast  # local import keeps this block self-contained

    def field_value(fragment):
        # Split on the FIRST colon only, so a value that itself contains ':'
        # stays intact, then decode the quoted value as a literal without
        # executing it.
        return ast.literal_eval(fragment.split(':', 1)[1])

    if not html:
        return

    titles = re.findall(r'\"title\":\".*?\"', html)
    prices = re.findall(r'\"price\":\"[\d]*\"', html)
    sales = re.findall(r'\"month_sales\":\"[\d]*\"', html)

    for raw_title, raw_price, raw_sales in zip(titles, prices, sales):
        try:
            record = [
                field_value(raw_title),
                field_value(raw_price),
                field_value(raw_sales),
            ]
        except (ValueError, SyntaxError):
            # The matched text is not a well-formed string literal (e.g. the
            # lazy match stopped at an escaped quote). Skip this record only
            # instead of abandoning the rest of the page.
            continue
        ilt.append(record)
def printGoods(ilt):
    """Print the collected goods as a tab-separated table.

    A header row is written first, followed by one row per entry of ``ilt``
    prefixed with a running number that starts at 1.

    Args:
        ilt: Sequence of records; items 0, 1 and 2 of each record are printed
            under the name, price and buyer-count headings.
    """
    row_format = "{:4}\t{:20}\t{:20}\t{:20}"
    header = row_format.format("序号", "商品名称", "价格", "付款人数")
    print(header)
    for number, goods in enumerate(ilt, start=1):
        name, price, buyers = goods[0], goods[1], goods[2]
        print(row_format.format(number, name, price, buyers))
def main(goods="手机", depth=2):
    """Search Taobao for ``goods`` and print the items found on the first pages.

    Args:
        goods: Search keyword placed in the ``q`` query parameter.
        depth: Number of result pages to fetch.
    """
    start_url = "https://s.taobao.com/search?q=" + goods
    info = []
    for page in range(depth):
        try:
            # The "s" query parameter is advanced by 48 for each page.
            html = getHTMLText(start_url + "&s=" + str(48 * page))
            parsePage(info, html)
        except Exception:
            # Best effort: a page that cannot be processed is skipped, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            continue
    printGoods(info)


# Run the crawl only when executed as a script, not as a side effect of import.
if __name__ == "__main__":
    main()
3、爬取中国大学排名
代码:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# time: 2017 / 10 / 22
import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
    """Fetch the HTML of ``url`` and return it.

    Args:
        url: Address of the page to download.

    Returns:
        The page text, decoded with the encoding detected from the content,
        or ``""`` if the request fails, times out after 30 seconds or returns
        an HTTP error status.
    """
    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding
    except requests.RequestException:
        # Catch request failures only; the previous bare ``except`` also
        # masked KeyboardInterrupt and unrelated bugs.
        return ""
    return resp.text
def fillUnivList(ulist, html):
    """Copy the first four cells of every table row in ``html`` into ``ulist``.

    Args:
        ulist: List that receives one ``[cell0, cell1, cell2, cell3]`` entry
            (the ``.string`` of each ``<td>``) per row of the first
            ``<tbody>``; it is modified in place.
        html: Markup of the ranking page. A page without a ``<tbody>`` adds
            nothing.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Locate the tbody tag.
    tbody = soup.find('tbody')
    if tbody is None:
        # No table at all (e.g. the download failed and html is empty);
        # return instead of failing with AttributeError on None.
        return
    for tr in tbody.children:
        # Children that are not bs4.element.Tag instances are filtered out.
        if not isinstance(tr, bs4.element.Tag):
            continue
        # Look up the td tags of this row.
        tds = tr('td')
        if len(tds) < 4:
            # Rows with fewer than four cells cannot supply all columns.
            continue
        ulist.append([td.string for td in tds[:4]])
def printUnivList(ulist, num):
    """Print up to ``num`` universities as a centred, tab-separated table.

    Args:
        ulist: List of rows; items 0-3 of each row are printed under the
            rank, name, province and score headings. ``None`` cells are
            printed as empty.
        num: Maximum number of rows to print. Fewer rows are printed when
            ``ulist`` is shorter; zero or a negative value prints the header
            only.
    """
    tplt = "{0:^8}\t{1:{4}^10}\t{2:{4}^8}\t{3:{4}^6}"
    # chr(12288) is U+3000, the full-width space; it is the fill character
    # for columns 1-3 (presumably so that CJK text lines up).
    fill = chr(12288)
    print(tplt.format("排名", "学校名称", "省市", "总分", fill))
    # Slicing never runs past the end of the list, unlike indexing
    # range(num), which raised IndexError on a short or empty list.
    for u in ulist[:max(num, 0)]:
        # None cannot be formatted with an alignment spec, so show it as "".
        cells = ["" if u[k] is None else u[k] for k in range(4)]
        print(tplt.format(cells[0], cells[1], cells[2], cells[3], fill))
def main(num=10):
    """Download the 2016 university ranking page and print its first rows.

    Args:
        num: Number of universities to print.
    """
    uinfo = []
    url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html"
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    # Print the information of ``num`` universities (10 by default).
    printUnivList(uinfo, num)


# Run the crawl only when executed as a script, not as a side effect of import.
if __name__ == "__main__":
    main()
4、爬取豆瓣top250
代码:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests
import re
import bs4
from bs4 import BeautifulSoup
def getHTMLText(url):
    """Return the text of the page at ``url``, or ``""`` if it cannot be fetched.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded with ``apparent_encoding``, or an empty
        string on a connection error, a timeout (30 seconds) or an HTTP error
        status.
    """
    try:
        page = requests.get(url, timeout=30)
        page.raise_for_status()  # treat HTTP error codes as failures
        page.encoding = page.apparent_encoding
        return page.text
    except requests.RequestException:
        # Limited to request errors so that Ctrl-C and real bugs are not
        # hidden the way the former bare ``except`` hid them.
        return ""
def paserHTML(ulist, html, fpath):
    """Parse one Douban Top 250 page into rows and append them to a file.

    Each row appended to ``ulist`` is
    ``[title, score, director, lead actor, count text, quote]``. The director
    is read from the ``<p>`` of each ``<li>`` in ``<ol class="grid_view">``;
    all other columns come from regular expressions run over the whole page
    and are paired up by position. The count text is the content of the
    ``<span>`` elements that start with a digit (presumably the number of
    ratings).

    After the rows are built, ``str(ulist)`` is appended to ``fpath``. If the
    page lacks the expected structure or the columns do not line up, an empty
    line is printed and the file is not written; rows built before the
    mismatch stay in ``ulist``.

    Args:
        ulist: List that receives the rows; it is modified in place.
        html: Markup of the page.
        fpath: Path of the UTF-8 file the list is appended to.
    """
    directors = []
    soup = BeautifulSoup(html, 'html.parser')
    try:
        # These patterns scan the whole page, so they are evaluated once
        # rather than once per <li>.
        name = re.findall(r'<span class="title">(.[^&]*?)</span>', html)
        score = re.findall(r'.*?"v:average">(.*?)</span>', html)
        votes = re.findall(r'<span>([\d].*?)</span>', html)
        actors = re.findall(r'主演:\s(.*?)\s', html)
        brief = re.findall(r'<span class="inq">(.*?)</span>', html)

        data = soup.find('ol', {'class': 'grid_view'})
        for m in data.find_all('li'):
            info = m.find('p').getText()
            directors.append(re.findall(r'导演:\s(.*?)\s', info)[0])

        # Indexing (rather than zip) is deliberate: a column that is too
        # short raises IndexError, which skips the file write below.
        for i in range(len(name)):
            ulist.append([name[i], score[i], directors[i],
                          actors[i], votes[i], brief[i]])

        with open(fpath, 'a', encoding='utf-8') as f:
            f.write(str(ulist))
    except (AttributeError, IndexError, OSError):
        # AttributeError: an expected tag is missing (find() returned None).
        # IndexError: a pattern found nothing or the columns differ in length.
        # OSError: the output file could not be written.
        print("")
def printDouban(ulist):
    """Print the movie rows as a tab-separated table with a running rank.

    A header row is written first; each entry of ``ulist`` then becomes one
    row made of its position (starting at 1) followed by its items 0-5.

    Args:
        ulist: Sequence of rows, each holding at least six items.
    """
    row_format = "{:4}\t{:16}\t{:10}\t{:20}\t{:20}\t{:20}\t{:20}"
    labels = ("排名", "电影名称", "评分", "导演", "主演", "评价", "简介")
    print(row_format.format(*labels))
    for rank, movie in enumerate(ulist, start=1):
        # Explicit indexing keeps the IndexError for rows that are too short.
        cells = [movie[k] for k in range(6)]
        print(row_format.format(rank, *cells))
def main(path="F://DouBan.txt"):
    """Fetch the Douban Top 250 front page, save the parsed rows and print them.

    Args:
        path: File the parsed movie list is appended to.
    """
    tinfo = []
    url = 'https://movie.douban.com/top250'
    html = getHTMLText(url)
    paserHTML(tinfo, html, path)
    printDouban(tinfo)


# Run the crawl only when executed as a script, not as a side effect of import.
if __name__ == "__main__":
    main()