from multiprocessing import Process, Queue
import queue  # for queue.Empty, raised by Queue.get() on timeout
import re
import time

import bs4
import requests
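# Spoof a desktop-browser User-Agent; some faculty pages reject requests that
# look like bots.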
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"}
start = time.time()
typewrite2 = 'http://gr.xjtu.edu.cn/web/majx'  # sample teacher-profile URL, used below to derive the site's base URL
typewrite = 'http://slst.xjtu.edu.cn/szdw1/jzyg.htm'  # first-level page: fill in the faculty-list URL here
link = requests.get(typewrite, headers=headers, timeout=20)
link.raise_for_status()
links = bs4.BeautifulSoup(link.text, 'lxml')
linkss = str(links)
# print(linkss)

# Profile-link patterns differ from school to school; the alternatives are
# kept here for reuse.
# linkregx = re.compile(r'\d{4}/\d{2,6}/\w+/page\.htm')
linkregx = re.compile(r'web/\w+')
# linkregx = re.compile(r'\w+/\d+/\d+/\w+/page\.htm')
# linkregx = re.compile(r'info/\d{3,6}/\d{3,6}\.htm')
# linkregx = re.compile(r'redir\.php\?catalog_id=\d+&\;object_id=\d+')

all_links = list(set(linkregx.findall(linkss)))  # deduplicate relative paths
print(all_links)
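# A sturdier alternative (sketch, not wired in) would read hrefs from the
# parsed tree instead of regexing its string form:
#     all_links = list({a['href'] for a in links.find_all('a', href=True)
#                       if a['href'].startswith('web/')})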
suffix = 'cn'  # renamed from `os`, which shadowed the standard-library module name
url_20 = typewrite2.rfind(suffix)
# print(url_20)
url_2 = typewrite2[0:url_20 + 3]  # base URL through 'cn/'; the part that differs per school
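# Worked example: for 'http://gr.xjtu.edu.cn/web/majx', rfind('cn') returns 19,
# so the slice [0:22] keeps 'http://gr.xjtu.edu.cn/', the base URL including
# its trailing slash.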
# Join each relative path to the base URL; these second-level pages carry
# each teacher's details.
mailss = [url_2 + path for path in all_links]
print(mailss)
n = len(mailss)
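# n is used below as the work queue's maxsize, so every URL can be enqueued
# without blocking.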
class MyProcessing(Process):
    def __init__(self, q):
        Process.__init__(self)
        self.q = q

    def run(self):
        print("Starting", self.pid)
        while not self.q.empty():
            crawler(self.q)
        print("Exiting", self.pid)
mailsss = []

def crawler(q):
    try:
        url = q.get(timeout=20)
    except queue.Empty:
        return  # another worker emptied the queue between empty() and get()
    try:
        response = requests.get(url, timeout=20, headers=headers)
        mailsource = bs4.BeautifulSoup(response.text, features="lxml")
        mailsources = str(mailsource)
        # Verbose e-mail pattern: local part, '@', domain, then a 2-4 letter TLD.
        mail = re.compile(r'''(
            [a-zA-Z0-9._%+-]+
            @
            [a-zA-Z0-9.-]+
            (\.[a-zA-Z]{2,4})
        )''', re.VERBOSE)
        mails = list(set(mail.findall(mailsources)))  # deduplicate
        if mails:
            print(mails)
        # mailsss.append(mails)  # disabled: each process has its own copy of mailsss
    except Exception as e:
        print(q.qsize(), url, "Error:", e)

# for index, column_header in enumerate(mailsss):
#     print(index, column_header)
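# To actually collect addresses across workers, a second Queue would be needed
# (sketch, assuming a `resultQueue` created in __main__ and passed to workers):
#     resultQueue.put(mails)            # inside crawler(), instead of append
#     while not resultQueue.empty():    # in the parent, after join()
#         mailsss.extend(resultQueue.get())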
if __name__ == "__main__":
    ProcessName = ["Process-1", "Process-2", "Process-3"]
    workQueue = Queue(n)
    for url in mailss:
        workQueue.put(url)
    processes = []
    for name in ProcessName:
        p = MyProcessing(workQueue)
        p.name = name
        p.daemon = True
        p.start()
        processes.append(p)
    for p in processes:
        p.join()  # wait on every worker, not only the last one started
    end = time.time()
    print("Total crawl time with Process + Queue multiprocessing:", end - start)