51job简历python爬虫
2018-08-20 本文已影响3人
右哼哼丨左哼哼
之前求职的时候,作为练习项目,爬取了一下 51job 的招聘信息;为避免遗忘,现在记录一下。
爬取目标:
字段表
https://search.51job.com/list/000000,000000,0000,00,9,99,python爬虫,2,1.html
爬取字段:
先上最终爬取结果图示:
Mysql 数据库图示。所用到的包:
from lxml import etree
import requests
import time
import pymysql
相关元素的xpath定位:
# Excerpt: locate every result-row container on the search page, then pull
# the six job fields out of each one with relative XPath expressions.
# (This fragment reappears inside downHtml() in the full script below.)
node_list = html.xpath("//div[@class='dw_table']")
for node in node_list:
'''
Position 职位名称
Company 公司名称
Place 工作地区
Wages 薪 资
Time 发布时间
Link 详情链接
'''
Position = node.xpath("./div/p/span/a/@title")
Company = node.xpath("./div/span[@class='t2']/a/text()")
Place = node.xpath("./div[@class='el']/span[2]/text()")
Wages = node.xpath("./div[@class='el']/span[3]/text()")
Time = node.xpath("./div[@class='el']/span[4]/text()")
Link = node.xpath("./div/p/span/a/@href")
文章中使用了Mysql数据库,如果想尝试运行代码,请先创建匹配的数据表:
-- Target table for the scraper. Column names (Chinese identifiers) must
-- match the INSERT statement in the Python code below; all payload columns
-- are TEXT since field lengths on 51job vary freely.
CREATE TABLE `51job` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`职位名称` text,
`公司名称` text,
`工作地区` text,
`薪资` text,
`发布时间` text,
`详情页` text,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8;
代码全文:
import csv
import time

import pymysql
import requests
from lxml import etree
class My51job():
    """Scrape 51job keyword-search result pages and persist the rows.

    Each parsed job row is appended to ``self.data``, inserted into the
    MySQL table ``51job`` (schema shown above), and can optionally be
    dumped to a CSV file via :meth:`writeFile`.
    """

    key = ''   # search keyword, set by the caller before crawling
    page = 1   # number of result pages the caller intends to fetch
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
        'Host':
        'search.51job.com'
    }

    def __init__(self):
        # Per-instance row accumulator. The original kept a class-level
        # mutable list, which would be shared across all instances.
        self.data = []
        # Connect at instantiation time instead of class-definition time,
        # so merely importing this module has no side effects.
        self.conn = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            passwd='你的密码',
            db='test',
            charset='utf8')
        self.cursor = self.conn.cursor()
        # Row count already in the table (kept for parity with the
        # original script; not otherwise used).
        self.effect_row = self.cursor.execute('select * from `51job`')

    def downHtml(self, key, page):
        """Fetch one search-result page, parse all job rows, insert them
        into MySQL, and return the accumulated list of rows.

        :param key:  search keyword (URL-embedded as-is).
        :param page: 1-based page number of the result listing.
        :return: ``self.data`` — cumulative list of
                 ``[position, company, place, wages, time, link]`` rows.
        """
        url = ('http://search.51job.com/list/'
               '000000,000000,0000,00,9,99,%s,2,%d.html' % (key, page))
        print('--------当前第%d页--------\n' % (page))
        # BUG FIX: the original passed the headers via params=, which sent
        # them as URL query parameters; they must be HTTP request headers.
        res = requests.get(url, headers=self.headers)
        res.encoding = 'gbk'  # 51job serves GBK-encoded pages
        html = etree.HTML(res.text)
        node_list = html.xpath("//div[@class='dw_table']")
        for node in node_list:
            # Field mapping:
            #   Position 职位名称 / Company 公司名称 / Place 工作地区
            #   Wages 薪资 / Time 发布时间 / Link 详情链接
            Position = node.xpath("./div/p/span/a/@title")
            Company = node.xpath("./div/span[@class='t2']/a/text()")
            Place = node.xpath("./div[@class='el']/span[2]/text()")
            Wages = node.xpath("./div[@class='el']/span[3]/text()")
            Time = node.xpath("./div[@class='el']/span[4]/text()")
            Link = node.xpath("./div/p/span/a/@href")
            for pos, com, pla, wag, tim, lin in zip(Position, Company,
                                                    Place, Wages, Time,
                                                    Link):
                self.data.append([pos, com, pla, wag, tim, lin])
                # BUG FIX: the original interpolated the values into the
                # SQL string with %-formatting, which breaks on any quote
                # character in a job title and is injection-prone. Use a
                # parameterized query and let the driver do the escaping.
                sql = ("insert into `51job` "
                       "(职位名称, 公司名称, 工作地区, 薪资, 发布时间, 详情页) "
                       "VALUES (%s, %s, %s, %s, %s, %s)")
                try:
                    print("执行插入....")
                    tt = self.cursor.execute(
                        sql, (pos, com, pla, wag, tim, lin))
                    print(tt)
                    self.conn.commit()
                except (UnicodeEncodeError, pymysql.Error) as e:
                    print(e)              # 打印错误原因
                    self.conn.rollback()  # 回滚数据库
                    # BUG FIX: the original also closed the connection
                    # here, so the first bad row made every subsequent
                    # insert fail. Keep the connection open and move on.
        time.sleep(1)  # throttle: be polite to the server
        return self.data

    def writeFile(self, mylist):
        """Dump the collected rows to ./51job/jobs.csv (GBK encoded).

        :param mylist: iterable of row lists as produced by downHtml().
        """
        # newline='' is required by the csv module to avoid blank lines
        # between rows on Windows.
        with open('./51job/jobs.csv', 'w', encoding='gbk',
                  newline='') as f:
            writer = csv.writer(f)
            writer.writerow(
                ['职位名称', '公司名称', '工作地区', '薪资', '发布日期', '详情页'])
            for job in mylist:
                writer.writerow(job)
        print("----本页写入完毕----")
if __name__ == '__main__':
    # Crawl the first `page` result pages for the keyword and echo the
    # rows collected so far after each page.
    spider = My51job()
    spider.key = 'python爬虫'
    spider.page = 5
    for page_no in range(1, spider.page + 1):
        rows = spider.downHtml(key=spider.key, page=page_no)
        # spider.writeFile(rows)
        for row in rows:
            print(row)