获取免费代理IP
~~~
import requests
from scrapy.selector import Selector
import pymysql
# Shared MySQL connection and cursor used by the queries in this module.
conn = pymysql.connect(
    host="127.0.0.1",
    user="root",
    passwd="",
    db="fy",
    charset="utf8",
)
cursor = conn.cursor()

# Browser-like headers sent with the proxy-listing page requests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
}
class GetRandomIp(object):
    """Collect free proxies into the ``proxy_ip`` table and hand out working ones.

    All methods use the module-level ``conn``/``cursor`` database objects;
    ``parse`` additionally uses the module-level ``headers`` dict.
    """

    def parse(self, next_url='/nn/1'):
        """Crawl the proxy listing, starting at *next_url*, and store fast proxies.

        Args:
            next_url: Path (appended to ``http://www.xicidaili.com``) of the
                first listing page to fetch.

        Pages are followed through the "next page" link until a page has no
        table rows or no such link.
        """
        # Loop rather than recurse once per page, so a long listing cannot
        # exhaust the interpreter's recursion limit.
        while True:
            response = requests.get(
                url='http://www.xicidaili.com{}'.format(next_url),
                headers=headers,
            )
            page = Selector(text=response.text)
            tr_list = page.css('#ip_list tr')
            if not tr_list:
                return
            self.parse_detail(tr_list)
            next_url = page.css('.pagination a.next_page::attr(href)').extract_first()
            if not next_url:
                return

    def parse_detail(self, tr_list):
        """Store every sufficiently fast proxy found in *tr_list*.

        Args:
            tr_list: Selector list of table rows; the first row is skipped
                (presumably the table header - confirm against the page).

        Rows without an IP, a port, or a readable speed are ignored. Only
        rows whose speed value is below 1 are inserted.
        """
        for tr in tr_list[1:]:
            ip = tr.css('td:nth-child(2)::text').extract_first()
            port = tr.css('td:nth-child(3)::text').extract_first()
            proxy_type = tr.css('td:nth-child(6)::text').extract_first()
            speed = self._parse_speed(
                tr.css('td:nth-child(7) div::attr(title)').extract_first()
            )
            if not ip or not port or speed is None:
                # Incomplete row: nothing usable to store.
                continue
            if speed < 1:
                self.insert_sql(ip, port, proxy_type)

    @staticmethod
    def _parse_speed(title):
        """Return the numeric part of a speed title, or None if it is unusable.

        The last character of *title* is dropped before conversion
        (presumably a unit suffix - confirm against the page markup).
        """
        if not title:
            return None
        try:
            return float(title[:-1])
        except ValueError:
            return None

    def insert_sql(self, ip='', port='', type=''):
        """Insert one proxy row into ``proxy_ip`` and commit.

        Args:
            ip: Proxy host.
            port: Proxy port.
            type: Proxy scheme label as scraped (may be None).
        """
        # Parameterized query: values come from scraped HTML, so they must
        # never be spliced into the SQL text.
        cursor.execute(
            "INSERT INTO proxy_ip (ip, port, type) VALUES (%s, %s, %s)",
            (ip, port, type),
        )
        conn.commit()

    def get_ip(self):
        """Return the URL of a randomly chosen proxy that passes ``check_ip``.

        Proxies that fail the check are deleted from the table and another
        one is drawn.

        Returns:
            A ``scheme://ip:port`` string, or None when the table holds no
            (remaining) proxies.
        """
        # Name the columns explicitly so the unpacking below does not depend
        # on the table's column order or on extra columns.
        sql = "select id, ip, port, type from proxy_ip ORDER BY RAND() LIMIT 1"
        # Loop instead of recursing so a long run of dead proxies cannot hit
        # the recursion limit.
        while True:
            cursor.execute(sql)
            row = cursor.fetchone()
            conn.commit()
            if row is None:
                print('no proxy ip available')
                return None
            row_id, ip, port, scheme = row
            scheme = scheme.lower() if scheme else 'http'
            proxy_url = '{0}://{1}:{2}'.format(scheme, ip, port)
            if self.check_ip(scheme, proxy_url):
                return proxy_url
            self.delete_ip(row_id)

    def check_ip(self, type, proxy_url):
        """Return True if a test request made with the proxy answers HTTP 200.

        Args:
            type: Key used in the ``proxies`` mapping passed to requests.
            proxy_url: Full proxy URL.

        Any exception raised by the request, or any status other than 200,
        counts as failure.
        """
        request_url = 'http://hf.58.com/ershoufang/0'
        # NOTE(review): the test URL is http://, while the proxies mapping is
        # keyed by the proxy's own scheme - confirm that non-"http" proxies
        # are really exercised by this request.
        try:
            response = requests.get(
                url=request_url,
                proxies={type: proxy_url},
                allow_redirects=False,
                timeout=2,
            )
        except Exception:
            # Deliberately broad: whatever goes wrong, the proxy is unusable.
            print('invalid ip and port')
            return False
        if response.status_code == 200:
            return True
        print('invalid ip and port')
        return False

    def delete_ip(self, id):
        """Delete the ``proxy_ip`` row whose primary key is *id* and commit."""
        cursor.execute("delete from proxy_ip where id = %s", (id,))
        conn.commit()
# Module-level instance, available to code that imports this module.
ip = GetRandomIp()

if __name__ == '__main__':
    # Run as a script: fetch one proxy URL and print it.
    ip = GetRandomIp()
    proxy_url = ip.get_ip()
    print(proxy_url)
~~~
在middlewares.py文件中定义IP中间件
~~~
import ip
class EsfIpMiddleware(object):
    """Downloader middleware that assigns a proxy to each outgoing request."""

    def process_request(self, request, spider):
        """Set ``request.meta["proxy"]`` to a proxy URL obtained from ``ip``.

        Args:
            request: The request about to be downloaded.
            spider: The spider that issued the request (unused).
        """
        # ``import ip`` binds the module itself. If that module exposes an
        # object named ``ip`` (assumed to be its ready-made proxy provider -
        # confirm against the module), call get_ip() on it; otherwise fall
        # back to a module-level get_ip().
        provider = getattr(ip, "ip", ip)
        proxy_ip = provider.get_ip()
        # Only set the proxy when one was actually returned.
        if proxy_ip:
            request.meta["proxy"] = proxy_ip
~~~
在settings.py配置文件DOWNLOADER_MIDDLEWARES中添加该中间件