BeautifulSoup4爬取某社招网站数据
2017-11-19 本文已影响171人
博行天下
####### 使用BeautifulSoup4爬取某社招网站数据,熟悉BeautifulSoup4的使用,同时也是对上一篇文章《第三篇 爬虫之基础BeautifulSoup4》的延续和练习,欢迎小朋友们一起学习和讨论
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
class Tencent(object):
    """Scrape one listing page of hr.tencent.com and append it to text files.

    ``get_content`` appends the position titles to ``position.txt`` and a
    one-line summary per posting to ``technology.txt``; both paths are
    relative to the current working directory.  The code targets Python 2
    (it relies on ``urllib2``).
    """

    def __init__(self):
        # Listing page that get_html() fetches.
        self.url = 'http://hr.tencent.com/position.php?&start=10#a'

    def get_html(self):
        """Open ``self.url`` and return the ``urllib2`` response object.

        A browser-like User-Agent header is sent with the request.  The
        caller owns the returned response and should close it when done.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'}
        request = urllib2.Request(self.url, headers=headers)
        return urllib2.urlopen(request)

    @staticmethod
    def _cell_text(tag):
        """Return the text of *tag*, even when ``tag.string`` is None."""
        text = tag.string
        if text is None:
            # BeautifulSoup sets .string to None when a tag has more than one
            # child; fall back to the concatenated text instead of crashing.
            text = tag.get_text()
        return text

    @staticmethod
    def _format_row(cells):
        """Build the ``technology.txt`` line for one table row.

        Args:
            cells: sequence with the text of each ``<td>`` in the row.

        Returns:
            The formatted line (ending in a newline), or None when the row
            has fewer than the five cells the format needs.
        """
        if len(cells) < 5:
            return None
        return u"".join([
            cells[1],
            u" 人数:", cells[2],
            u" 地点:", cells[3],
            u" 时间:", cells[4],
            u"\n",
        ])

    @staticmethod
    def _append_lines(path, lines):
        """Append *lines* (text) to *path* as UTF-8; do nothing if empty."""
        if not lines:
            # Do not create an empty file when there is nothing to write.
            return
        with open(path, 'a') as out:
            out.writelines(line.encode("utf-8") for line in lines)

    def get_content(self):
        """Fetch the page and append its postings to the two output files.

        Position titles (``.l > a`` links) go to ``position.txt``, one per
        line.  Rows with class ``even`` are written to ``technology.txt``
        first, followed by rows with class ``odd``; rows that do not have
        at least five cells are skipped.
        """
        response = self.get_html()
        try:
            soup = BeautifulSoup(response, 'lxml')
        finally:
            # The document is fully parsed at this point; release the socket.
            response.close()

        titles = [self._cell_text(link) + u"\n"
                  for link in soup.select('.l > a')]
        self._append_lines("position.txt", titles)

        # Even rows first, then odd rows, so the output order is unchanged.
        rows = list(soup.select('.even')) + list(soup.select('.odd'))
        summaries = []
        for row in rows:
            line = self._format_row(
                [self._cell_text(cell) for cell in row.select('td')])
            if line is not None:
                summaries.append(line)
        self._append_lines("technology.txt", summaries)
# Alternative: the data can also be stored in the file as JSON, e.g.
# items = {}
# items["name"] = name
# line = json.dumps(items, ensure_ascii=False)
# output.write(line.encode('utf-8'))
# output.close()
if __name__ == "__main__":
    # Script entry point: scrape the listing page and write the output files.
    Tencent().get_content()
本篇分享的文章如果涉及到某公司的利益,请第一时间告知,我会马上删除这篇文章。转载请注明出处,谢谢。