Python requests库 ————百度新闻
2018-03-19 本文已影响0人
chliar
# -*- coding:utf-8 -*-
from email.mime.text import MIMEText
import requests,pymysql,threading,logging
import smtplib
import time,datetime
from lxml import etree
import re
class baidu_new(object):
    """Scrape Baidu News search results and mail a digest of unseen headlines.

    For every keyword in ``self.input`` a worker thread downloads the search
    result page, extracts title / source / publish time / link of each hit,
    prints the text of the linked article, stores headlines that are not yet
    in the MySQL table ``new`` and e-mails the newly stored ones.

    NOTE(review): the database password and the SMTP authorization code are
    hard-coded in this class. They are kept unchanged so existing deployments
    keep working, but they should be moved to configuration and rotated.
    """

    # Search endpoint; ``{}`` receives the URL-quoted keyword.
    SEARCH_URL = 'http://news.baidu.com/ns?word={}&from=news&cl=2&rn=20&ct=1'
    # Seconds to wait for any single HTTP request before giving up.
    REQUEST_TIMEOUT = 10
    # Connection settings shared by mysql() and createmysql()
    # (change host, account and password when testing).
    _DB_CONFIG = {
        'host': '127.0.0.1',
        'port': 3306,
        'database': 'baidu_new',
        'user': 'root',
        'password': 'mysql',
        'charset': 'utf8',
    }
    # An explicit charset declaration restricted to the encodings handled here.
    _CHARSET_RE = re.compile(rb'''charset\s*=\s*["']?\s*(gb2312|gbk|utf-8)''')
    # Fallback order when no declaration is found (same order as the
    # original substring heuristic).
    _KNOWN_CHARSETS = (b'gb2312', b'gbk', b'utf-8')
    # Deletes newlines, tabs and carriage returns in one pass.
    _STRIP_BREAKS = str.maketrans('', '', '\n\t\r')

    def __init__(self):
        """Set the request headers and the list of keywords to search for."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        }
        # Fixed keyword list. To read keywords interactively instead, split
        # a comma-separated input() string into this list.
        self.input = ['杨幂']

    def thread(self):
        """Start one crawler thread per keyword in ``self.input``."""
        for keyword in self.input:
            worker = threading.Thread(
                target=self.get_url,
                args=(self._build_search_url(keyword), keyword),
            )
            worker.start()

    @classmethod
    def _build_search_url(cls, keyword):
        """Return the search URL for *keyword* with the keyword URL-quoted."""
        from urllib.parse import quote

        # Quoting keeps characters such as '&' or '#' inside the keyword from
        # being interpreted as part of the query string.
        return cls.SEARCH_URL.format(quote(keyword, safe=''))

    @staticmethod
    def _split_source_time(parts):
        """Split the summary text nodes into ``(source, publish_time)``.

        The source and the time are separated by two non-breaking spaces.
        When the separator is missing the time is returned as ``''``
        instead of raising ``IndexError``.
        """
        fields = ''.join(parts).split('\xa0\xa0')
        source = fields[0].strip()
        published = fields[1].strip() if len(fields) > 1 else ''
        return source, published

    @classmethod
    def _detect_charset(cls, raw):
        """Guess the encoding of the HTML bytes *raw*.

        An explicit ``charset=...`` declaration wins. Otherwise the first of
        gb2312 / gbk / utf-8 mentioned anywhere in the page is used, and
        ``'utf-8'`` is returned when nothing matches.
        """
        lowered = raw.lower()
        declared = cls._CHARSET_RE.search(lowered)
        if declared:
            return declared.group(1).decode('ascii')
        for name in cls._KNOWN_CHARSETS:
            if name in lowered:
                return name.decode('ascii')
        return 'utf-8'

    @staticmethod
    def _build_mail_body(new_data):
        """Return the numbered plain-text digest, one line per news item."""
        return ''.join(
            '%s、%s 来源于:%s 发布时间:%s\n'
            % (num, new['title'], new['source'], new['time'])
            for num, new in enumerate(new_data, 1)
        )

    def get_url(self, url, data_url):
        """Fetch one search result page, then store and mail the new headlines.

        :param url: search result page URL.
        :param data_url: the keyword that produced *url*; used for the
            e-mail subject.
        """
        try:
            response = requests.get(url, headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(e)
            return
        text = response.content.decode('utf-8', 'ignore')
        page = etree.HTML(text) if text.strip() else None
        if page is None:
            return
        data_list = []
        for node in page.xpath('//div[@class="result"]'):
            links = node.xpath('./h3[@class="c-title"]/a/@href')
            if not links:
                # A result without a link cannot be followed; skip it.
                continue
            link = links[0]
            # string() concatenates all text under the anchor, including the
            # highlighted keyword, so the title no longer has to be rebuilt
            # from the repr of a list of text fragments.
            title = node.xpath('string(./h3[@class="c-title"]/a)').strip()
            summary = node.xpath('./div[@class="c-summary c-row c-gap-top-small"]/div[@class="c-span18 c-span-last"]/p/text()|./div[@class="c-summary c-row "]/p/text()')
            source, published = self._split_source_time(summary)
            item = {
                'title': title,
                'source': source,
                'time': published,
                'url': link,
            }
            try:
                self.get_url_two(link)
            except (requests.RequestException, ValueError, etree.LxmlError) as e:
                # The article text is only printed; one unreachable or
                # unparsable article must not lose the whole batch.
                print(e)
            data_list.append(item)
        if data_list:
            new_data = self.mysql(data_list)
            print(new_data)
            if new_data:
                self.smtp(new_data, data_url)

    def get_url_two(self, url):
        """Download a news article, print its heading/paragraph text and return it.

        :param url: article URL.
        :return: the concatenated ``<h1>`` and ``<p>`` text with newlines,
            tabs and carriage returns removed (``''`` for an empty page).
        """
        response = requests.get(url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        raw = response.content
        text = raw.decode(self._detect_charset(raw), 'ignore')
        page = etree.HTML(text) if text.strip() else None
        if page is None:
            return ''
        fragments = page.xpath('//*/h1/text()|//*/p/text()')
        article = ''.join(
            fragment.translate(self._STRIP_BREAKS) for fragment in fragments
        )
        print(article)
        return article

    def mysql(self, data_list):
        """Insert the headlines that are not stored yet and return them.

        :param data_list: list of dicts with ``title``, ``source`` and
            ``time`` keys.
        :return: the items that were newly inserted; an empty list when the
            database cannot be reached.
        """
        new_data = []
        try:
            conn = pymysql.connect(**self._DB_CONFIG)
        except pymysql.MySQLError as e:
            # Without a connection there is nothing to query or clean up.
            print(e)
            return new_data
        # Placeholders let the driver escape the scraped (untrusted) text.
        select_sql = "select * from new where title = %s"
        insert_sql = "insert into new(title,source,time) values(%s,%s,%s)"
        try:
            with conn.cursor() as cs1:
                for item in data_list:
                    try:
                        count = cs1.execute(select_sql, (item['title'],))
                    except pymysql.ProgrammingError:
                        # Most likely the table does not exist yet:
                        # create it and retry once.
                        self.createmysql()
                        count = cs1.execute(select_sql, (item['title'],))
                    if count > 0:
                        continue
                    cs1.execute(
                        insert_sql,
                        (item['title'], item['source'], item['time']),
                    )
                    new_data.append(item)
                    print(item['title'])
            conn.commit()
        finally:
            conn.close()
        return new_data

    def smtp(self, new_data, keyword=None):
        """E-mail the digest of newly stored headlines.

        :param new_data: list of dicts with ``title``, ``source`` and
            ``time`` keys.
        :param keyword: keyword shown in the subject; defaults to the first
            configured keyword, which is what the subject always used before.
        """
        msg_from = '2387765890@qq.com'  # sender address
        # NOTE(review): SMTP authorization code committed in source.
        passwd = 'jjqvnbtcedbzdhhi'  # sender's authorization code
        msg_to = '847734623@qq.com'  # recipient address (set when testing)
        if keyword is None:
            keyword = self.input[0]
        subject = "%s的最新新闻" % keyword
        msg = MIMEText(self._build_mail_body(new_data), 'plain', 'utf-8')
        msg['Subject'] = subject
        msg['From'] = msg_from
        msg['To'] = msg_to
        try:
            # The context manager sends QUIT only when the connection was
            # actually established.
            with smtplib.SMTP_SSL("smtp.qq.com", 465) as s:
                s.login(msg_from, passwd)
                s.sendmail(msg_from, msg_to, msg.as_string())
            print("发送成功")
        except (smtplib.SMTPException, OSError) as e:
            print("发送失败")
            print(e)

    def createmysql(self):
        """Create the ``new`` table if it does not exist yet."""
        try:
            conn = pymysql.connect(**self._DB_CONFIG)
        except pymysql.MySQLError as e:
            print(e)
            return
        try:
            with conn.cursor() as cs1:
                cs1.execute("""
                    create table if not exists new(
                        title varchar(1000),
                        source varchar(40),
                        time varchar(100)
                    );
                """)
            conn.commit()
        except pymysql.MySQLError as e:
            print(e)
        finally:
            conn.close()
if __name__ == '__main__':
    def main(h=1, m=0, interval=300):
        """Run the crawler forever, once every *interval* seconds.

        :param h: unused; kept so existing ``main(h, m)`` calls still work.
        :param m: unused; kept so existing ``main(h, m)`` calls still work.
        :param interval: seconds to sleep between two crawls (default 300).

        To crawl once a day at a fixed time instead, compare ``now.hour`` /
        ``now.minute`` with the wanted time before starting the crawl and
        poll every 60 seconds. For a single manual run, call
        ``baidu_new().thread()`` directly.
        """
        while True:
            now = datetime.datetime.now()
            print(now.hour, now.minute)
            bn = baidu_new()
            bn.thread()
            time.sleep(interval)  # wait before the next crawl

    main()