Crawling Email Addresses with Python

Last time I agonized over this for quite a while; it turns out email addresses can be matched without worrying about the page encoding at all.
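For example, a quick check along those lines (a minimal sketch; the URL is only a placeholder): the regex is pure ASCII, so re.findall can run directly on the raw, undecoded page bytes.

import re
import urllib2

mailpattern = re.compile('[A-Za-z0-9_]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')
raw = urllib2.urlopen("http://www.example.com").read()  # raw bytes, no decoding
print mailpattern.findall(raw)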

Below is a simple breadth-first crawler built on a queue. I'll leave it at this for now; I probably won't touch it again for a while and will revise it when I find the time. (Yet another unfinished project...)

 

# -*- coding: cp936 -*-
import urllib2
import re
from pyquery import PyQuery as pq
from lxml import etree

#mailpattern = re.compile('[^\._:>\\-][\w\.-]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')
mailpattern = re.compile('[A-Za-z0-9_]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')

htmlcount = 0    # number of pages fetched so far
maxcount = 3000  # maximum number of pages to fetch
allUrls = set()
allMails = set()
UrlsQlist = []
UrlsQdict = {}
url = "http://www.163.com"
fmails = open("E:/py/crawler/mailresult.txt", "a")
furls = open("E:/py/crawler/urlresult.txt", "a")


def geturls(data):
    '''Extract the URLs from an HTML page.'''
    urls = set()
    if data:
        d = pq(data)
        label_a = d.find('a')  # find the <a> tags with pyquery
        if label_a:
            label_a_href = d('a').map(lambda i, e: pq(e)('a').attr('href'))
            for u in label_a_href:
                if u[0:10] != "javascript":
                    if u[0:4] == "http":
                        urls.add(u)
                    else:
                        urls.add(url + u)  # relative link: prepend the start URL
        return urls
    else:
        return None

def gethtml(url):
    '''Download a page; return None on failure.'''
    try:
        fp = urllib2.urlopen(url)
    except:
        print "urllib2.urlopen error"
        return None
    else:
        mybytes = fp.read()
        fp.close()
        return mybytes

def savemails(data):
    '''Collect the email addresses found in the page.'''
    if data:
        mailResult = mailpattern.findall(data)
        mailResultset = set(mailResult)
        if mailResultset:
            allMails.update(mailResultset)

def savehtml(pagecontent, count):
    '''Save the page content to disk, one file per page.'''
    if pagecontent != None:
        f = open("E:/py/crawler/html/" + str(count) + ".html", "w")
        f.write(pagecontent)
        f.close()
    else:
        f = open("E:/py/crawler/html/" + str(count) + "error" + ".html", "w")
        f.write("this page empty")
        f.close()

def BFS(firstUrl):
    '''Breadth-first crawl starting from firstUrl, using a list as the queue.'''
    global htmlcount
    global maxcount
    allUrls.add(firstUrl)
    UrlsQlist = list(allUrls)
    while htmlcount < maxcount:            # stop once the page limit is reached
        tempUrl = UrlsQlist.pop(0)         # dequeue the next URL
        myWebStr = gethtml(tempUrl)
        savehtml(myWebStr, htmlcount)
        savemails(myWebStr)
        firstUrls_set = geturls(myWebStr)  # URLs found on this page
        if firstUrls_set != None:
            allUrls.update(firstUrls_set)  # record every URL seen
            for u in firstUrls_set:
                if u not in UrlsQlist:
                    UrlsQlist.append(u)
        htmlcount = htmlcount + 1


BFS(url)
for u in allMails:
    try:
        fmails.write(u)
        fmails.write('\n')
    except:
        continue
for u in allUrls:
    try:
        furls.write(u)
        furls.write('\n')
    except:
        continue
fmails.close()
furls.close()
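A side note on the queue: list.pop(0) shifts every remaining element, so it is O(n). The standard library's collections.deque pops from the left in O(1). Here is a minimal sketch of the same loop on a deque, reusing the functions defined above (illustrative only, not a drop-in replacement):

from collections import deque

UrlsQ = deque([url])
while htmlcount < maxcount and UrlsQ:
    tempUrl = UrlsQ.popleft()        # O(1) dequeue
    myWebStr = gethtml(tempUrl)
    savehtml(myWebStr, htmlcount)
    savemails(myWebStr)
    newUrls = geturls(myWebStr)
    if newUrls:
        for u in newUrls:
            if u not in allUrls:     # set lookup instead of scanning the queue
                allUrls.add(u)
                UrlsQ.append(u)
    htmlcount += 1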

 


2013.5.13 update

I originally wanted to add multithreading, but after going through a lot of material I still had no idea where to start, so I'll keep studying it and change the code later.
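One possible starting point, purely as a rough and untested sketch: a few worker threads pulling URLs from a thread-safe Queue.Queue and updating the shared result set under a lock, reusing gethtml and savemails from the code above (the thread count of 5 is an arbitrary choice):

import threading
import Queue

task_q = Queue.Queue()
lock = threading.Lock()

def worker():
    while True:
        u = task_q.get()
        page = gethtml(u)        # same download function as above
        if page:
            with lock:           # protect the shared result sets
                savemails(page)
        task_q.task_done()

for _ in range(5):
    t = threading.Thread(target=worker)
    t.setDaemon(True)
    t.start()
# (the BFS loop would feed URLs in with task_q.put(u) and wait with task_q.join())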

I did add a bit of URL normalization. The cleaned-up code is below; a short usage example of url_normal follows it.

import urllib2
import re
from pyquery import PyQuery as pq
from lxml import etree
import urlparse
import time

allUrls = set()
allMails = set()
urlsDownlist = []

class mailCrawler:
    def __init__(self, mailExpression, start_url, maxcount):
        '''mailExpression: regular expression for email addresses;
        start_url: URL the crawl starts from;
        maxcount: maximum number of pages to fetch'''
        self.mailpattern = re.compile(mailExpression)
        self.maxcount = maxcount
        self.htmlcount = 0
        self.UrlsQlist = []  # URL queue for the breadth-first crawl
        self.url = start_url

    def url_normal(self, url):
        '''Normalize a URL.'''
        scheme, netloc, path, query = urlparse.urlsplit(url)[:4]
        netloc = netloc.lower()

        url.encode("utf-8")

        if path:
            path = re.sub('/{2,}', '/', path)  # collapse repeated slashes
            path = re.sub(r'\.$', '', path)    # drop a trailing dot
            path = re.sub('/$', '', path)      # drop a trailing slash
            path = re.sub('\s', '', path)      # remove whitespace
        if query:
            return '%s://%s%s?%s' % (scheme, netloc, path or '/', query)
        else:
            return '%s://%s%s' % (scheme, netloc, path)

    def geturls(self, data):
        '''Extract the URLs from an HTML page.'''
        urls = set()
        if data:
            d = pq(data)
            label_a = d.find('a')  # find the <a> tags with pyquery
            if label_a:
                label_a_href = d('a').map(lambda i, e: pq(e)('a').attr('href'))
                for u in label_a_href:
                    if u[0:10] != "javascript" and u[0:6] != "mailto":
                        if u[0:4] == "http":
                            normal_url = self.url_normal(u)
                            urls.add(normal_url)
                        else:
                            normal_url = self.url_normal(self.url + u)
                            urls.add(normal_url)
            return urls
        else:
            return None

    def gethtml(self, url):
        '''Download a page with a 5-second timeout; return None on failure.'''
        try:
            fp = urllib2.urlopen(url, None, 5)
        except:
            print "urllib2.urlopen error or timeout"
            return None
        else:
            mybytes = fp.read()
            fp.close()
            return mybytes

    def savemails(self, data):
        '''Add the email addresses found in the page to allMails (a set, so duplicates are dropped).'''
        global allMails
        if data:
            mailResult = self.mailpattern.findall(data)
            mailResultset = set(mailResult)
            if mailResultset:
                allMails.update(mailResultset)

    def savehtml(self, pagecontent, htmlcount, url):
        '''Save the page content to disk, one file per page.'''
        if pagecontent != None:
            f = open("E:/py/crawler/html/" + str(htmlcount) + ".html", "w")
            f.write(pagecontent)
            f.close()
        else:
            f = open("E:/py/crawler/html/" + str(htmlcount) + "error" + ".html", "w")
            try:
                f.write(url)
            except:
                f.write("encode error")
            f.close()

    def BFS(self):
        '''Breadth-first crawl of the URLs, using a list as the queue.'''
        global allUrls
        global urlsDownlist
        allUrls.add(self.url)
        self.UrlsQlist = list(allUrls)
        while self.htmlcount < self.maxcount:       # stop once the page limit is reached
            tempUrl = self.UrlsQlist.pop(0)         # dequeue the next URL
            print tempUrl
            urlsDownlist.append(tempUrl)
            myWebStr = self.gethtml(tempUrl)
            self.savehtml(myWebStr, self.htmlcount, tempUrl)
            self.savemails(myWebStr)
            firstUrls_set = self.geturls(myWebStr)  # URLs found on this page
            if firstUrls_set != None:
                for u in firstUrls_set:
                    if u not in allUrls:
                        allUrls.add(u)
                        self.UrlsQlist.append(u)
            self.htmlcount = self.htmlcount + 1


def main():
    reg = r'[A-Za-z0-9_]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+'
    url = "http://www.baidu.com"
    count = 100
    fmails = open("E:/py/crawler/mailresult.txt", "a")
    furls = open("E:/py/crawler/urlresult.txt", "a")
    fdownUrls = open("E:/py/crawler/urlDownresult.txt", "a")
    newcrawler = mailCrawler(reg, url, count)
    newcrawler.BFS()
    for u in allMails:
        try:
            fmails.write(u)
            fmails.write('\n')
        except:
            continue
    for u in allUrls:
        try:
            furls.write(u)
            furls.write('\n')
        except:
            continue
    for u in urlsDownlist:
        try:
            fdownUrls.write(u)
            fdownUrls.write('\n')
        except:
            continue
    fmails.close()
    furls.close()
    fdownUrls.close()

if __name__ == '__main__':
    main()
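For reference, a quick illustration of what url_normal produces for a messy link (the example URL is made up; the output follows from the substitutions above):

crawler = mailCrawler(r'[A-Za-z0-9_]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+',
                      "http://www.baidu.com", 1)
print crawler.url_normal("http://WWW.Example.COM//mail//contact/")
# -> http://www.example.com/mail/contact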

 

 
