2019-06-28

from multiprocessing import Process, Queue
import time
import re
import requests
import bs4

# pretend to be a desktop browser so the pages don't reject the crawler
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"}

start = time.time()

typewrite2 = 'http://gr.xjtu.edu.cn/web/majx'
typewrite = 'http://slst.xjtu.edu.cn/szdw1/jzyg.htm'   # first-level page URL: the faculty list

link = requests.get(typewrite, headers=headers, timeout=20)
link.raise_for_status()
links = bs4.BeautifulSoup(link.text, 'lxml')
linkss = str(links)
# print(linkss)

# linkregx = re.compile(r'\d{4}/\d{2,6}/\w+/page\.htm')
linkregx = re.compile(r'web/\w+')   # pattern matching this site's personal-page paths
# other link patterns tried, kept commented out:
# linkregx = re.compile(r'\w+/\d+/\d+/\w+/page\.htm')
# linkregx = re.compile(r'info/\d{3,6}/\d{3,6}\.htm')
# linkregx = re.compile(r'redir\.php\?catalog_id=\d+&\;object_id=\d+')

all_links = linkregx.findall(linkss)
all_links = list(set(all_links))    # deduplicate
print(all_links)

os = 'cn'                            # marker used to cut the base URL at the end of the domain
url_20 = typewrite2.rfind(os)
# print(url_20)
url_2 = typewrite2[0:url_20 + 3]     # base URL; this part differs for each school.  The second-level pages hold each teacher's details.

mailss = []
for i in range(len(all_links)):
    url3 = url_2 + all_links[i]      # absolute URL of one teacher's personal page
    mailss.append(url3)
print(mailss)

n = len(mailss)
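To make the slicing above concrete (the URL is re-stated from the post; the print lines are my own illustration): rfind('cn') locates the end of the domain, so url_2 becomes the site root, and appending a 'web/...' path found by the regex gives a full profile URL.

# hypothetical illustration of the URL composition above
typewrite2 = 'http://gr.xjtu.edu.cn/web/majx'
base = typewrite2[:typewrite2.rfind('cn') + 3]
print(base)                 # http://gr.xjtu.edu.cn/
print(base + 'web/majx')    # http://gr.xjtu.edu.cn/web/majx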

class MyProcessing(Process):
    def __init__(self, q):
        Process.__init__(self)
        self.q = q

    def run(self):
        print("Starting", self.pid)
        while not self.q.empty():
            crawler(self.q)
        print("Exiting", self.pid)

mailsss = []

def crawler(q):
    url = q.get(timeout=20)
    try:
        response = requests.get(url, timeout=20, headers=headers)
        mailsource = bs4.BeautifulSoup(response.text, features="lxml")
        mailsources = str(mailsource)
        # verbose e-mail pattern: group 1 is the whole address, group 2 the top-level domain
        mail = re.compile(r'''(
                              [a-zA-Z0-9._%+-]+
                              @
                              [a-zA-Z0-9.-]+
                              (\.[a-zA-Z]{2,4})
                              )''', re.VERBOSE)
        mails = mail.findall(mailsources)
        mails = list(set(mails))
        if mails:               # only print pages that actually contain addresses
            print(mails)
        # mailsss.append(mails)
    except Exception as e:
        print(q.qsize(), url, "Error:", e)

'''for index, column_header in enumerate(mailsss):
    print(index, column_header)'''
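For reference, here is how that verbose pattern behaves on its own; the sample string is my own and not taken from any crawled page. Because the pattern has two capture groups, findall returns (address, TLD) tuples rather than bare strings.

import re

mail = re.compile(r'''(
                      [a-zA-Z0-9._%+-]+
                      @
                      [a-zA-Z0-9.-]+
                      (\.[a-zA-Z]{2,4})
                      )''', re.VERBOSE)

sample = "Contact: zhangsan@mail.xjtu.edu.cn or lisi@example.org"
print(mail.findall(sample))
# [('zhangsan@mail.xjtu.edu.cn', '.cn'), ('lisi@example.org', '.org')]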

if __name__ == "__main__":
    ProcessName = ["Process-1", "Process-2", "Process-3"]
    workQueue = Queue(n)
    for url in mailss:
        workQueue.put(url)

    processes = []
    for i in range(0, 3):
        p = MyProcessing(workQueue)
        p.daemon = True
        p.start()
        processes.append(p)
    for p in processes:          # wait for every worker, not just the last one started
        p.join()

    end = time.time()
    print("Process + Queue multi-process total crawl time:", end - start)
