python自动爬取别致数据并保存图片
2016-11-09 本文已影响86人
代码没写完休想上厕所
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
import urllib
import xlrd
import xlwt
import xlutils
from xlutils.copy import copy
import sys
reload(sys)
sys.setdefaultencoding('utf8')
'''
Created on Nov 9, 2016
@author: xwang

Usage notes:
1. Create "biezhidb.xls" in the same directory as this script before running.
2. Pictures are saved to /Users/xunwang/Desktop/别致爬虫/pic2/
'''
# Root address of the site being crawled; a numeric item id is appended to it.
baseUrl = "http://chocolateback.sinaapp.com/"

# Index of the spreadsheet row the next scraped item is written to.
# Reassigned in the __main__ block and advanced by pachong().
currentRow = 0
def testXlrd(filename):
    """Return the number of rows in the first sheet of the workbook *filename*.

    The workbook is opened with xlrd; only the row count of sheet 0 is read.
    """
    first_sheet = xlrd.open_workbook(filename).sheet_by_index(0)
    return first_sheet.nrows
def testXlwt(filename, index, data):
    """Write *data* into one cell of the first sheet and save the workbook.

    The target cell is (currentRow, index): the row comes from the
    module-level ``currentRow`` variable, the column from *index*.

    A workbook opened with xlrd cannot be written to, so it is converted to
    a writable workbook with ``xlutils.copy.copy``, modified, and then saved
    back over *filename*.

    Args:
        filename: path of the .xls workbook to update in place.
        index: zero-based column index of the cell to write.
        data: value to store in the cell.
    """
    # The original also fetched sheet 0 from the xlrd workbook into a local
    # that was never used; that dead lookup has been dropped.
    writable_book = copy(xlrd.open_workbook(filename))
    writable_book.get_sheet(0).write(currentRow, index, data)
    writable_book.save(filename)
def _extract(cont, head, tail):
    """Return the stripped text between the first *head* and the next *tail*.

    The text is decoded from UTF-8 into a unicode object.

    Raises:
        ValueError: if *head* is not present in *cont*, or no *tail* follows it.
    """
    start = cont.find(head)
    if start == -1:
        raise ValueError("marker %r not found" % head)
    start += len(head)
    end = cont.find(tail, start)
    if end == -1:
        raise ValueError("closing marker %r not found" % tail)
    return unicode(cont[start:end].strip(), "utf-8")


def pachong(start=10000, stop=14000):
    """Crawl item pages and record each item in 'biezhidb.xls'.

    For every id in ``range(start, stop)`` the page ``baseUrl + str(id)`` is
    downloaded and the item link, title, price and picture URL are cut out
    of the HTML by plain substring search.  The values are written to the
    row ``currentRow`` of the workbook (column 0: id, 1: title, 2: link,
    3: price, 4: picture URL), the picture is downloaded via storePic(), and
    ``currentRow`` is advanced by one.

    A page is skipped, without advancing ``currentRow``, when its link has
    "redirect" at characters 7-15 or when any step raises an exception.

    Args:
        start: first item id to fetch (inclusive).
        stop: item id to stop at (exclusive).
    """
    global currentRow
    for i in range(start, stop):
        page_url = baseUrl + str(i)
        print(page_url)
        try:
            up = urllib2.urlopen(page_url)
            try:
                cont = up.read()
            finally:
                # Always release the connection, even if read() fails.
                up.close()

            # Parse every field before touching the workbook so that a page
            # with missing markup does not leave a half-written row behind.
            url = _extract(cont, '<a target="_blank" href="', '">')
            # Characters 7-15 are the first eight after a 7-character
            # prefix such as "http://"; links with "redirect" there are
            # not recorded.
            if url[7:15] == "redirect":
                continue
            title = _extract(cont, '<span class="title">', '</span>')
            print(title)
            price = _extract(cont, '<span class="price">', '</span>')
            # The head marker consumes the leading "h" of the picture
            # address, so it is put back here.
            picUrl = "h" + _extract(cont, '<img src="h', '">')

            testXlwt('biezhidb.xls', 2, url)
            testXlwt('biezhidb.xls', 0, str(i))
            testXlwt('biezhidb.xls', 1, title)
            testXlwt('biezhidb.xls', 3, price)
            testXlwt('biezhidb.xls', 4, picUrl)
            storePic(picUrl, i)
            currentRow = currentRow + 1
        except Exception as err:
            # Best-effort crawl: report the failure and move on to the next
            # id.  Catching Exception (not a bare except) keeps Ctrl-C and
            # SystemExit working.
            print("skipped %s: %s" % (page_url, err))
def storePic(url, id):
    """Download the picture at *url* and save it as ``biezhi_<id>.jpg``.

    Files are stored in /Users/xunwang/Desktop/别致爬虫/pic2/ ; the directory
    is created first if it does not exist, so that a missing folder does not
    make every download fail.

    Args:
        url: address of the picture to download.
        id: item identifier used in the file name.  (The name shadows the
            builtin ``id`` but is kept so existing callers keep working.)
    """
    import os  # local import: only this function needs it

    pic_dir = '/Users/xunwang/Desktop/别致爬虫/pic2/'
    if not os.path.isdir(pic_dir):
        os.makedirs(pic_dir)
    # (id,) keeps the format call correct even if a tuple is ever passed.
    urllib.urlretrieve(url, pic_dir + 'biezhi_%s.jpg' % (id,))
def main():
    """Run the crawler, appending after the rows already in 'biezhidb.xls'."""
    global currentRow
    # Start writing at the row index equal to the sheet's current row count.
    currentRow = testXlrd('biezhidb.xls')
    print(currentRow)
    pachong()
    print("写入完毕!")


if __name__ == '__main__':
    main()