Python web crawling

Original post; please credit the source when reposting.

Single-threaded crawling:

Modules used: urllib2 and re.

# -*- coding: cp936 -*-
import urllib2
import re

def main():
    url = "http://www.baidu.com/"
    req = urllib2.Request(url)
    resp = urllib2.urlopen(req)
    respHtml = resp.read()
    # match the link <a href="/duty/" name="tj_duty">使用百度前必读</a>
    ahn = '<a\s+?href="/duty/"\s+?name="tj_duty">(?P<content>.+)</a>'
    found = re.search(ahn, respHtml)
    print 'found=', found
    if found:
        a1 = found.group("content")
        print 'content', a1

if __name__ == '__main__':
    main()

(?P<name>...) assigns a name to a capture group; on the resulting match object you can pass that name to group() to retrieve whatever (?P<name>...) matched.
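For instance, a minimal illustration (the group name word below is arbitrary):

import re

# the named group "word" captures the text between the tags
m = re.search(r'<b>(?P<word>\w+)</b>', '<b>hello</b>')
if m:
    print m.group('word')   # prints: hello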

Beautiful Soup is a Python library for extracting data from HTML or XML files (see its official documentation). We can use Beautiful Soup to scrape the same data; the parser used here is lxml.

# -*- coding: cp936 -*-
from bs4 import BeautifulSoup
import urllib2
import re

def main():
    url = "http://www.baidu.com/"
    req = urllib2.Request(url)
    resp = urllib2.urlopen(req)
    respHtml = resp.read()
    soup = BeautifulSoup(respHtml, 'lxml')
    found = soup.find(href='/duty/')
    # found = soup.find(attrs={'name': 'tj_duty'})   # the second of two ways to match the same tag
    print 'found:', found
    if found:
        content = found.string
        print 'content:', content

if __name__ == '__main__':
    main()

To crawl data more efficiently, multithreading is essential. Below is a simple multithreaded crawler.

# -*- coding: cp936 -*-
from Queue import Queue
from threading import Thread
import time
import urllib2
import urlparse

num_threads = 2
q = Queue()
urls = ['http://www.baidu.com',
        'http://www.sina.com',
        'http://www.qq.com',
        ]
for url in urls:
    q.put(url)

def download(i, q):
    while True:
        print 'start download %s' % i
        url = q.get()  # get() blocks here and waits until a URL is available
        parsed_url = urlparse.urlparse(url)
        print 'Downloading: %s' % url
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req)
        data = resp.read()
        filename = url.rpartition('/')[-1]
        with open(filename + '.html', 'wb') as outfile:
            outfile.write(data)
        print 'complete download %s:%s' % (url, i)
        q.task_done()

for i in range(num_threads):
    worker = Thread(target=download, args=(i, q,))
    worker.setDaemon(True)
    worker.start()
q.join()
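Each worker is marked as a daemon thread because it never leaves its while True loop; q.join() blocks the main thread until task_done() has been called once for every URL put into the queue, and when the main thread then exits, the daemon workers are terminated along with it.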

A thread pool makes concurrent task execution more efficient; there is an introduction to thread pools at http://www.cnblogs.com/tracylining/p/3471594.html. Now let's put one to use.

from Queue import Queue
from threading import Thread
import datetime
import urllib2
import urlparse
import threadpool

url_list = ['http://www.baidu.com',
            'http://www.qq.com',
            'http://www.sina.com',
            ]

def download(url):
    try:
        parsed_url = urlparse.urlparse(url)
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req)
        data = resp.read()
        # the download/ directory must already exist
        with open("download/" + str(hash(url)), 'wb') as f:
            f.write(data)
        return url, 'success'
    except Exception:
        return url, 'failed'

def callback(request, result):
    # threadpool calls this with the WorkRequest and the value returned by download()
    url, status = result
    print '%s download is %s' % (url, status)

def threadPoolDownload(poolsize, args):
    start = datetime.datetime.now()
    pool = threadpool.ThreadPool(poolsize)
    requests = threadpool.makeRequests(download, args, callback)
    [pool.putRequest(req) for req in requests]
    pool.wait()
    end = datetime.datetime.now()
    print "Start download : ", start
    print "End download : ", end
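Assuming the download/ directory already exists, the pool version can be driven the same way as the earlier scripts, for example with two worker threads:

if __name__ == '__main__':
    threadPoolDownload(2, url_list)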

Multithreaded concurrency can also be achieved with Stackless and Twisted. I have not studied Stackless microthreads yet; for Twisted, see my other posts. I have been fairly busy lately, so that code will have to wait; when I get to it I plan to compare these concurrency approaches, so stay tuned.
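In the meantime, here is a rough sketch (not the comparison promised above) of what a Twisted-based download might look like, assuming an older Python 2 Twisted where twisted.web.client.getPage is still available:

from twisted.internet import reactor, defer
from twisted.web.client import getPage

urls = ['http://www.baidu.com', 'http://www.qq.com', 'http://www.sina.com']

def save(data, url):
    # data is the response body delivered by the Deferred
    with open(str(hash(url)) + '.html', 'wb') as f:
        f.write(data)
    print 'complete download %s' % url

def report_error(failure, url):
    print '%s download failed: %s' % (url, failure.getErrorMessage())

deferreds = []
for url in urls:
    d = getPage(url)
    d.addCallback(save, url)
    d.addErrback(report_error, url)
    deferreds.append(d)

# stop the reactor once every download has either succeeded or failed
defer.DeferredList(deferreds).addBoth(lambda _: reactor.stop())
reactor.run()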
