Multithreaded crawling of CSDN blogs with a proxy

The code is as follows:

import re
import urllib.error
import urllib.request
import threading
import time
import random

def use_proxy(url, proxy_addr):
    # Build an opener that routes HTTP traffic through the proxy and install it globally
    proxy = urllib.request.ProxyHandler({"http": proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)

    # Pretend to be a regular browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
    request = urllib.request.Request(url=url, headers=headers)

    # Fetch the page; pass the Request object so the custom headers are actually sent
    data = urllib.request.urlopen(request).read().decode("utf-8", "ignore")

    # Extract the titles from the search-result page.
    # NOTE: the HTML tags that originally surrounded (.*?) were stripped when the
    # post was published; the pattern below is only a placeholder that captures link text.
    pat = re.compile(r'<a[^>]*?>(.*?)</a>', re.S)
    data = pat.findall(data)
    return data


class Onr(threading.Thread):  # thread that crawls the odd-numbered pages
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        # Crawl the odd-numbered result pages
        for i in range(1, 20, 2):
            try:
                # Build the URL of the search-result page
                url = "https://so.csdn.net/so/search/s.do?p=" + str(i) + "&q=ios&t=blog&domain=&o=&s=&u=&l=&rbg=0"
                data = use_proxy(url, "1.198.72.44:9999")
                print("Results on page " + str(i) + ": " + str(data))
                # Pause for a random 0-6 seconds between requests
                span = round(random.random() * 6, 1)
                time.sleep(span)
            except urllib.error.URLError as e:
                # A URLError may carry a status code and/or a reason; check before printing
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)
                time.sleep(2)
            except Exception as e:
                print("exception" + str(e))
                time.sleep(1)


class Two(threading.Thread):  # thread that crawls the even-numbered pages
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        # Crawl the even-numbered result pages
        for i in range(2, 20, 2):
            try:
                url = "https://so.csdn.net/so/search/s.do?p=" + str(i) + "&q=ios&t=blog&domain=&o=&s=&u=&l=&rbg=0"
                data = use_proxy(url, "1.198.72.44:9999")
                print("Results on page " + str(i) + ": " + str(data))
                span = round(random.random() * 6, 1)
                time.sleep(span)
            except urllib.error.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)
                time.sleep(2)
            except Exception as e:
                print("exception" + str(e))
                time.sleep(1)


# Create the threads
one = Onr()
two = Two()

# Start the threads
one.start()
two.start()
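One thing to keep in mind: urllib.request.install_opener replaces the process-wide default opener, so both threads share whatever opener was installed last. With a single shared proxy this is harmless, but a per-call opener avoids the global state entirely. Below is a minimal sketch of that variant; fetch_via_proxy is a hypothetical helper, and the proxy address and timeout are placeholders:

import urllib.request

def fetch_via_proxy(url, proxy_addr, timeout=10):
    # Build a private opener instead of installing a global one,
    # so each call (and therefore each thread) stays independent.
    proxy = urllib.request.ProxyHandler({"http": proxy_addr, "https": proxy_addr})
    opener = urllib.request.build_opener(proxy)
    request = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36'}
    )
    # opener.open sends the request through the proxy without touching global state
    return opener.open(request, timeout=timeout).read().decode("utf-8", "ignore")

# Usage (the proxy below is the same placeholder as above and may no longer be alive):
# html = fetch_via_proxy("https://so.csdn.net/so/search/s.do?p=1&q=ios&t=blog", "1.198.72.44:9999")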

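The two hand-written Thread subclasses only split the pages into odd and even ranges. As an alternative sketch (not part of the original post), the same crawl can be written with concurrent.futures.ThreadPoolExecutor, reusing the use_proxy function and the same placeholder proxy:

import random
import time
from concurrent.futures import ThreadPoolExecutor

def crawl_page(i):
    # Build the URL for page i and fetch it through the proxy
    url = ("https://so.csdn.net/so/search/s.do?p=" + str(i) +
           "&q=ios&t=blog&domain=&o=&s=&u=&l=&rbg=0")
    try:
        data = use_proxy(url, "1.198.72.44:9999")  # use_proxy is defined above
        print("Results on page " + str(i) + ": " + str(data))
    except Exception as e:
        print("exception on page " + str(i) + ": " + str(e))
    # Random pause between requests, as in the original code
    time.sleep(round(random.random() * 6, 1))

# Two workers roughly mirror the original odd/even split
with ThreadPoolExecutor(max_workers=2) as pool:
    pool.map(crawl_page, range(1, 20))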