爬虫ip代理(转载)

转载于:https://www.jianshu.com/p/e8073fe13421

同一个IP针对一个网站短期内大量的访问一般会致使IP被封,除了在爬取数据时增长延迟(爬取量不大或者对爬取速度没要求),还有一个好方法就是使用代理IP,这样就能够完美解决IP被封的问题。
那么,问题来了,代理的IP从哪儿来呢,土豪天然随意,直接淘宝买一些代理IP就好,稳定也不是特别贵。但对于技术爱好者,也许并无那个需求,其实网上仍是有不少免费的代理IP的,随意打开百度一搜就是,选择第一个不是广告的网站为例。

1.爬取代理商的ip

#coding=utf-8 import urllib2 from bs4 import BeautifulSoup import csv def IPspider(numpage): csvfile = file('ips.csv', 'wb') writer = csv.writer(csvfile) url = 'http://www.xicidaili.com/nn/' user_agent = 'IP' headers = {'User-agent': user_agent} for num in xrange(1, numpage + 1): ipurl = url + str(num) print 'Now downloading the ' + str(num * 100) + ' ips' request = urllib2.Request(ipurl, headers=headers) content = urllib2.urlopen(request).read() bs = BeautifulSoup(content, 'html.parser') res = bs.find_all('tr') for item in res: try: temp = [] tds = item.find_all('td') temp.append(tds[1].text.encode('utf-8')) temp.append(tds[2].text.encode('utf-8')) writer.writerow(temp) except IndexError: pass # 假设爬取前十页全部的IP和端口 IPspider(10) 

这样就爬到了1000个代理IP和端口,固然了,免费也有免费的坏处,那就是并非全部的代理IP均可以用,因此咱们须要检查一下哪些IP是可使用的。如何检查该IP是否可用,咱们就看连上代理后能不能在2秒内打开百度的页面,若是能够,则认为IP可用,添加到一个list里供后面备用,实现代码以下。

2.验证

#coding=utf-8 import socket import urllib2 import csv def IPpool(): socket.setdefaulttimeout(2) reader=csv.reader(open('ips.csv')) IPpool=[] for row in reader: proxy=row[0]+':'+row[1] print proxy proxy_handler=urllib2.ProxyHandler({"http":proxy}) opener=urllib2.build_opener(proxy_handler) urllib2.install_opener(opener) try: html=urllib2.urlopen('http://www.baidu.com') # print html IPpool.append([row[0],row[1]]) except Exception,e: continue return IPpool IPpool() 

这样的话,就取得了一系列可用的IP代理,配合以前的爬虫使用,就不太容易出现IP被封的状况了,不过在目前这种状况下,验证IP所须要的时间过久,因此能够采用多线程或者多进程的方法来进一步提升效率。
固然,若是有更好的验证IP可用性的方法,请告诉我,我总以为用打开网站的方法很蠢,并且效率也不高。

本身封装的ip池

# -*- coding:utf-8 -*-
"""A small self-contained pool of free HTTP proxies scraped from xicidaili."""
import random
import re


class Proxies():
    """Scrape free high-anonymity proxies, validate them, and hand them out."""

    # Listing page of the free-proxy site.
    url = 'http://www.xicidaili.com/nn'
    # Fixed UA used only for fetching the proxy listing itself.
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.3'
                             '6 (KHTML, like Gecko) Ubuntu Chromium/56.0.2924.7'
                             '6 Chrome/56.0.2924.76 Safari/537.36'}

    def __init__(self):
        # Proxies that passed validation. Kept per-instance: the original
        # declared this as a CLASS attribute, so every Proxies() instance
        # shared — and kept appending to — one global list.
        self.tmp_proxies = []

    # random UA
    def get_user_agent(self):
        """Return a randomly chosen User-Agent string.

        :return: one UA string from a fixed built-in pool.
        """
        user_agents = [
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
            "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
            "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
        ]
        return random.choice(user_agents)

    # scrape the IPs
    def get_proxies(self):
        """Scrape the xicidaili listing page and validate every IP found.

        :return: list of usable "ip:port" strings accumulated so far.
        """
        # Imported here so the module stays importable (and testable)
        # without the third-party requests package.
        import requests
        html = requests.get(url=self.url, headers=self.headers).text
        # Capture "ip</td><td>port" pairs from the listing table markup.
        pattern = r'(\d+\.\d+\.\d+\.\d+)</td>\s*<td>(\d+)'
        for host, port in re.findall(pattern, html):
            self.test_proxy(host + ':' + port)
        return self.tmp_proxies

    # check whether one IP is usable
    def test_proxy(self, ip):
        """Validate one proxy by fetching a known page through it.

        Appends *ip* to ``self.tmp_proxies`` when the request succeeds
        (HTTP 200 within 5 seconds); failures are only logged.

        :param ip: candidate proxy as an "ip:port" string.
        """
        import requests
        tar_url = "https://www.jianshu.com/p/642124e6c240"
        headers = {'User-Agent': self.get_user_agent()}
        proxies = {"http": ip}
        try:
            res = requests.get(tar_url, proxies=proxies,
                               headers=headers, timeout=5)
            if res.status_code == 200:
                self.tmp_proxies.append(ip)
            else:
                # Raise for non-200 so a single handler logs every failure.
                res.raise_for_status()
        except requests.RequestException as e:
            print("验证代理IP" + ip + "时发生以下错误 :")
            print(e)


if __name__ == '__main__':
    pro = Proxies()
    print(pro.get_proxies())

使用>>>>>>>爬boss数据为例

#coding=utf-8 import requests from proxies import Proxies import time import re import random from lxml.etree import HTML class boss(): url = 'https://www.zhipin.com/c101010100/h_101010100/?query=python%E7%88%AC%E8%99%AB&page=2&ka=page-2' def parse(self): tt = Proxies() user_agent = tt.get_user_agent() headers = { 'User-Agent': user_agent } ip = tt.get_proxies() print ip num = len(ip) num = random.randint(0, num+1) ip = ip[num] print ip proxies = { "http": ip, } try: res = requests.get(self.url, proxies=proxies, headers=headers, timeout=5) if res.status_code == 200: selector = HTML(res.content.decode('utf-8')) # content = selector.xpath('//text()') # content = ''.join(content) # print content.replace('\t', '').replace('\n', '').replace('\r', '').replace(' ', '') title = selector.xpath('//div[@class="job-list"]/ul/li/div/div[1]/h3/a/@href') for i in range(len(title)): print title[i] else: res.raise_for_status() # 若是响应状态码不是200,主动抛出异常 except requests.RequestException as e: print u"验证代理IP" + ip + u"时发生以下错误 :" print(e) if __name__ == '__main__': bo = boss() bo.parse() 

用这个即用即取,效率很高。

做者:Py_Explorer 连接:https://www.jianshu.com/p/e8073fe13421 來源:简书 简书著做权归做者全部,任何形式的转载都请联系做者得到受权并注明出处。