IP代理池基于mongodb数据库

2017-07-22  本文已影响48人  天地清霜love橙

代码基于 Python 2.7:抓取西刺(xici)免费代理,逐个检测可用性后存入 MongoDB 数据库,为以后的爬虫做准备。下面直接上代码:

```

#-*-encoding=utf-8-*-

import time
from multiprocessing import Pool

import pymongo
import requests
from lxml import etree

class Getproxy(object):
    """Scrape free HTTP proxies from xicidaili.com into MongoDB and validate them.

    Proxies are stored in the ``xiciipinfo`` collection of the ``xici``
    database on a MongoDB server at ``localhost:27017``.  Each document has
    the keys ``ip``, ``port``, ``protocol`` and ``area``; ``ip`` is used as
    the upsert key, so re-scraping refreshes existing entries.
    """

    def __init__(self):
        """Set up the request headers, the listing URL and the MongoDB handles."""
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
        self.url = 'http://www.xicidaili.com/wt/'
        self.client = pymongo.MongoClient('localhost', 27017)
        self.xici = self.client['xici']
        self.xiciipinfo = self.xici['xiciipinfo']

    @staticmethod
    def _first_text(row, path):
        """Return the first node matched by XPath *path* under *row*, or ''."""
        found = row.xpath(path)
        return found[0] if found else ''

    def getip(self, num):
        """Scrape listing page *num* and upsert each proxy row into MongoDB.

        Only ``<tr class="odd">`` rows are read.  Fields are extracted row by
        row so that a row missing one cell (e.g. no area link) cannot shift
        the fields of the rows after it.  Rows without an ip or a port are
        skipped; a missing protocol or area is stored as ''.

        :param num: page number appended to the listing base URL.
        :raises requests.RequestException: if the page cannot be fetched
            (including a timeout after 10 seconds).
        """
        url = self.url + str(num)
        # A timeout keeps an unresponsive server from blocking the run forever.
        wb_data = requests.get(url, headers=self.headers, timeout=10)
        html = etree.HTML(wb_data.text)
        for row in html.xpath('//tr[@class="odd"]'):
            ip = self._first_text(row, 'td[2]/text()')
            port = self._first_text(row, 'td[3]/text()')
            if not ip or not port:
                continue
            data = {
                'ip': ip,
                'port': port,
                'protocol': self._first_text(row, 'td[6]/text()'),
                'area': self._first_text(row, 'td[4]/a/text()'),
            }
            print(data)
            # Upsert keyed on ip: insert new proxies, refresh known ones.
            self.xiciipinfo.update_one({'ip': ip}, {'$set': data}, upsert=True)

    def count(self, num):
        """Scrape listing pages 1 .. num-1, sleeping 2 seconds after each page.

        :param num: exclusive upper bound of the page range, so ``count(2)``
            scrapes page 1 only.
        """
        for page in range(1, num):
            self.getip(page)
            time.sleep(2)

    def dbclose(self):
        """Close the MongoDB client connection."""
        self.client.close()

    def getiplist(self):
        """Return every stored proxy as a ``requests``-style proxies dict.

        :returns: list of ``{"http": "http://<ip>:<port>"}`` dicts, one per
            document in the collection (empty list if there are none).
        """
        return [
            {"http": "http" + "://" + doc['ip'] + ":" + doc['port']}
            for doc in self.xiciipinfo.find()
        ]

    def iptest(self, proxy):
        """Check one proxy and delete it from MongoDB if the request fails.

        The check is a GET to http://wenshu.court.gov.cn/ through the proxy
        with a 6 second timeout; only an exception counts as failure (the
        HTTP status code is not inspected).

        :param proxy: dict of the form ``{"http": "http://<ip>:<port>"}``.
        """
        # Strip the leading "http://" (7 characters) and the ":port" suffix.
        ip = proxy['http'][7:].split(':')[0]
        try:
            requests.get('http://wenshu.court.gov.cn/', proxies=proxy, timeout=6)
        except Exception:
            # Any request failure marks the proxy as dead.  Unlike a bare
            # ``except:``, this lets KeyboardInterrupt/SystemExit propagate,
            # so interrupting the run does not delete a proxy.
            print('field...............>>>>>>>>>>>>>>>>>>>>>>>>')
            self.xiciipinfo.delete_many({'ip': ip})
            print('remove it now.....{}'.format(ip))
        else:
            print('<<<<<<<<<<<<<<<<<.............success')
            print(proxy)

if __name__ == '__main__':
    # Script entry point: scrape the first listing page, then test every
    # stored proxy and drop the ones that fail.
    #
    # No multiprocessing Pool is created here: the original one was never
    # used or closed, so it only left idle worker processes behind.
    proxy = Getproxy()
    try:
        proxy.count(2)  # range(1, 2) -> page 1 only
        # Explicit loop instead of map(): the calls are made for their side
        # effects, and map() is lazy on Python 3 so it would test nothing.
        for candidate in proxy.getiplist():
            proxy.iptest(candidate)
    finally:
        # Close the MongoDB connection even if scraping or testing raised.
        proxy.dbclose()

```

上一篇下一篇

猜你喜欢

热点阅读