初学 Python:使用 Python 抓取某网页的信息
# -*- coding: utf-8 -*-
#引入包
import io
import sys
import urllib2

import chardet
from bs4 import BeautifulSoup
# Python 2-only workaround: force the process-wide default encoding to UTF-8.
# Kept so any implicit str/unicode conversion behaves as it did before; the
# file write at the bottom encodes explicitly and does not depend on it.
reload(sys)
sys.setdefaultencoding('utf-8')

# Page to scrape, and the file that receives the extracted cell text.
PAGE_URL = "http://gs.amac.org.cn/amac-infodisc/res/pof/manager/138.html"
OUTPUT_PATH = 'd:/pythonWorkSpace/Python27PygamePy2exe-master/Python27PygamePy2exe-master/test.html'

# Fetch the raw page bytes; close the response even if read() fails.
req = urllib2.Request(PAGE_URL)
response = urllib2.urlopen(req)
try:
    content = response.read()
finally:
    response.close()

# Decode the bytes to unicode using the charset chardet guesses.
# chardet returns {'encoding': None} when it cannot guess, so the key is
# always present and dict.get()'s default would never apply; `or` supplies
# the UTF-8 fallback instead of letting decode(None) raise TypeError.
infoencode = chardet.detect(content).get('encoding') or 'utf-8'
html = content.decode(infoencode, 'ignore')

# Parse the decoded text directly. Re-encoding it to the filesystem encoding
# first (as before) could drop or reject characters outside that code page.
soup = BeautifulSoup(html, 'html.parser')

# Write the text of every <td class="td-content"> cell, back to back with no
# separator, as UTF-8. The `with` block closes the file even on error.
with io.open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    for cell in soup.find_all('td', class_='td-content'):
        f.write(cell.text)