蚌埠学院官网综合新闻条目抓取
2018-10-16 本文已影响24人
我的袜子都是洞
蚌埠学院综合新闻
（配图：QQ图片20181016133347.png、2.png）
import json
import re

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
def get_one_page(url):
    """Fetch *url* and return the response body as text, or None on failure.

    A desktop-browser User-Agent is sent because the site may reject
    requests from bare HTTP clients.  Any requests-level error and any
    non-200 status both collapse to None.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    }
    try:
        resp = requests.get(url, headers=browser_headers)
    except RequestException:
        return None
    return resp.text if resp.status_code == 200 else None
def page_parser(html):
    """Yield one dict per news entry found in *html*.

    Each news row is a ``<td height="24">`` cell; the inner ``<td>`` holds
    the anchor whose href is relative to the site root, and a
    ``class="postTime"`` element carries the publication date.
    Yields dicts with keys 'href', 'title', 'postTime'.
    """
    soup = BeautifulSoup(html, 'lxml')
    for row in soup.find_all(name='td', attrs={'height': 24}):
        inner_cell = row.find(name='td')
        anchor = inner_cell.a
        entry = {
            # Hrefs on the page are site-root-relative; prepend the host.
            'href': 'http://www.bbc.edu.cn' + anchor.attrs['href'],
            'title': anchor.font.string,
            'postTime': row.find(class_='postTime').string,
        }
        yield entry
def get_pages(url):
    """Return the total page count (as a string) for the news list at *url*.

    The count is read from the "进入尾页" (go to last page) link, whose
    href looks like ``.../i/<N>/list.htm`` — N is the last page number.

    Returns None when the page cannot be fetched, the link is missing
    (e.g. a single-page list or a layout change), or the href does not
    contain a ``/i/<digits>/`` segment.
    """
    html = get_one_page(url)
    if html is None:
        # Network failure or non-200 response — report "unknown".
        return None
    soup = BeautifulSoup(html, 'lxml')
    last_link = soup.find(name='a', attrs={'title': '进入尾页'})
    if last_link is None:
        return None
    # Extract the page number from the '/i/<N>/' path segment instead of a
    # fragile fixed split index (the old split('/')[8] broke whenever the
    # href was absolute rather than site-relative).
    match = re.search(r'/i/(\d+)/', last_link.attrs['href'])
    return match.group(1) if match else None
def write_to_file(content):
    """Append *content* to result.txt as one JSON line.

    ``ensure_ascii=False`` keeps Chinese text readable in the output file;
    the file is opened in append mode so successive calls accumulate lines.
    """
    serialized = json.dumps(content, ensure_ascii=False)
    with open('result.txt', 'a', encoding='utf-8') as out_file:
        out_file.write(serialized + '\n')
def main(num=0):
    """Scrape the news list pages and print every entry found.

    num: when non-zero, scrape only the first *num* pages instead of the
    full count reported by the site.
    """
    pages = get_pages('http://www.bbc.edu.cn/s/21/t/267/p/22/i/1/list.htm')
    pages = int(pages)
    if num:
        pages = num
    # range's upper bound is exclusive — use pages + 1 so the last page is
    # actually scraped (the old range(1, pages) skipped it, contradicting
    # the summary line printed below).
    for page in range(1, pages + 1):
        url = 'http://www.bbc.edu.cn/s/21/t/267/p/22/i/' + str(page) + '/list.htm'
        html = get_one_page(url)
        for item in page_parser(html):
            print(item)
    print('抓取了: ' + str(pages) + '页综合新闻')
if __name__ == '__main__':
    # Limit this run to the first 20 pages of the news list.
    main(num=20)