# urllib-and-requests: proxy (代理) download examples
# coding: utf-8
# python 3.6+
import traceback
import urllib
import urllib.request
import urllib.parse
import time
from urllib import request
import requests
# Seed pool of 'ip:port' HTTP proxies. ProxyManager rotates and prunes this
# list in place at runtime, and refills it (download currently disabled) when
# it runs empty.
proxy_ips = [
'220.179.7.249:8080',
]
class ProxyManager(object):
    """Round-robin manager over the module-level ``proxy_ips`` pool."""

    @staticmethod
    def get_proxy_ip():
        """Return the next 'ip:port' proxy string, or None if none available.

        Rotates the pool: the returned entry is moved to the back so repeated
        calls cycle through all known proxies.
        """
        if not proxy_ips:
            # Pool is empty: (re)download a proxy list. Both download sources
            # are currently disabled, so ``ips`` stays empty here.
            ips = ''
            # ips = str(HtmlDownloader.download('http://api.xicidaili.com/free2016.txt', use_proxy=False), encoding='utf-8')
            # ips = str(HtmlDownloader.download('http://tvp.daxiangdaili.com/ip/?tid=555709422560593&num=10', use_proxy=False), encoding='utf-8')
            proxy_ips.extend(ips.split())
        if not proxy_ips:
            return None
        ip = proxy_ips.pop(0)
        proxy_ips.append(ip)
        return ip

    @staticmethod
    def remove_ip(ip):
        """Drop a bad proxy from the pool; a no-op if it is already gone."""
        try:
            proxy_ips.remove(ip)
        except ValueError:
            # Was a bare ``except: pass`` — only "not in list" should be
            # ignored, not arbitrary errors.
            pass
class HtmlDownloader(object):
    """Static helpers that download pages, optionally through rotating proxies."""

    @staticmethod
    def download(url, data=None, delay=None, use_proxy=True):
        """Download *url* and return the response body as bytes.

        :param url: target URL; None yields None.
        :param data: query parameters forwarded to :meth:`download_solve`.
        :param delay: optional pre-request sleep in seconds.
        :param use_proxy: when True, try up to 10 proxies from ProxyManager,
            discarding each proxy that raises; returns None if all attempts fail.
        :return: body bytes, or None on failure / None url.
        """
        if not use_proxy:
            return HtmlDownloader.download_solve(url, data, delay, None)
        for _attempt in range(10):
            _proxy_ip = ProxyManager.get_proxy_ip()
            try:
                print('使用代理ip:', _proxy_ip)
                return HtmlDownloader.download_solve(url, data, delay, _proxy_ip)
            except Exception as e:
                print('下载出错:', e)
                print('尝试更换代理')
                ProxyManager.remove_ip(_proxy_ip)
        return None  # every proxy attempt failed

    @staticmethod
    def download_solve(url, data=None, delay=None, proxy_ip=None):
        """Fetch *url* with ``requests`` and return the raw body bytes.

        :param url: target URL; None short-circuits to None.
        :param data: query parameters (passed as ``params``).
        :param delay: seconds to sleep before the request.
        :param proxy_ip: optional 'host:port' HTTP proxy.
        :return: response body as bytes, or None when url is None.
        """
        if url is None:
            return None
        if delay:
            try:
                print('sleep ', delay, ' seconds')
                time.sleep(int(delay))
                print('sleep over')
            except Exception as e:
                print(e)
                traceback.print_exc()
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/31.0.1650.63 Safari/537.36'}
        # BUG FIX: the original passed ``proxies={'http', proxy_ip}`` — a set
        # literal, not a dict — which requests rejects. ``proxies`` must map
        # scheme -> proxy address.
        proxies = {'http': proxy_ip} if proxy_ip else None
        return requests.get(url, params=data, headers=headers, proxies=proxies).content

    @staticmethod
    def download_response(url, data=None, delay=None, use_proxy=False):
        """Fetch *url* with ``urllib`` and return the open response object.

        :param url: target URL; None yields None.
        :param data: POST body mapping; urlencoded and utf-8 encoded if given.
        :param delay: seconds to sleep before the request.
        :param use_proxy: route through a proxy from ProxyManager.
        :return: the urllib response (status 200), or None on non-200 / None url.
        """
        if url is None:
            return None
        if delay:
            try:
                print('sleep ', delay, ' seconds')
                time.sleep(int(delay))
                print('sleep over')
            except Exception as e:
                print(e)
                traceback.print_exc()
        if data is not None:
            data = urllib.parse.urlencode(data).encode(encoding="utf-8")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/31.0.1650.63 Safari/537.36'}
        req = urllib.request.Request(url, data, headers)
        if use_proxy:
            # BUG FIX: the original passed the ProxyHandler as the 4th
            # positional argument of Request, which is ``origin_req_host`` —
            # the proxy was silently never used. A handler must be installed
            # via build_opener.
            proxy_handler = urllib.request.ProxyHandler({'http': ProxyManager.get_proxy_ip()})
            opener = urllib.request.build_opener(proxy_handler)
            res = opener.open(req, timeout=15)
        else:
            res = urllib.request.urlopen(req, timeout=15)
        if res.getcode() != 200:
            print("下载页面 %s 失败." % res.url)
            return None
        print("下载页面 %s 成功." % res.url)
        return res
if __name__ == "__main__":
    # Manual smoke test: fetch a proxy-list URL directly (no proxy) and print
    # the utf-8-decoded body.
    body = HtmlDownloader.download(
        'http://tvp.daxiangdaili.com/ip/?tid=555709422560593&num=10',
        use_proxy=False)
    print(str(body, encoding='utf-8'))
# (scraped page footer: "最后更新于" = "Last updated", "这有帮助吗?" = "Was this helpful?")