Python Web Scraping: Regular Expressions vs. PyQuery

Target page: http://ypk.39.net/2017019/manual

The goal is to extract the drug's package insert (instruction manual) from the page.
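Both snippets below target the same markup. Inferred from the regexes and selectors that follow, the relevant part of the page looks roughly like this (a reconstruction for orientation, not the page's exact HTML):

<div class="tab_box">
    <dt>批准文号</dt>   <!-- field name, e.g. the approval number -->
    <dd>...</dd>        <!-- field value -->
    <dt>...</dt>
    <dd>...</dd>
</div>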

First, the regular-expression approach:

# -*- coding: gbk -*-

import urllib2
import re
import random
import os

def ziduantiqu(page):
    # pattern1 captures the approval number ("批准文号"), used as the output filename;
    # pattern2 captures the whole <dt>/<dd> block of the package insert
    pattern1 = re.compile('<div class="tab_box">(.*?)批准文号.*?<dd>(.*?)<.*?</div>', re.S)
    pattern2 = re.compile('<div class="tab_box">(.*?)<dt>(.*?)</div>', re.S)
    filename = 'unknown'  # fallback in case pattern1 matches nothing
    items1 = re.findall(pattern1, page)
    for item1 in items1:
        # keep only uppercase letters and digits so the name is filesystem-safe
        filename = re.sub("[^A-Z0-9]", "", item1[1])
        # avoid overwriting a file that already exists for another drug
        if (filename + '.txt') in os.listdir("E:/yaopintong1/"):
            filename = filename + '_1'
        print filename
    items2 = re.findall(pattern2, page)
    for item2 in items2:
        # strip all whitespace, then turn block-level closing tags into newlines
        content = re.sub(r"\s", "", item2[1])
        content = content.replace('</dt>', '\n')
        content = content.replace('</strong><br/>', '')
        content = content.replace('<br/>', '\n')
        content = content.replace('</p>', '\n')
        content = content.replace('</dd>', '\n')
        content = content.replace('&nbsp;', '')
        # drop any remaining HTML tags
        dr = re.compile(r'<[^>]+>', re.S)
        dd = dr.sub('', content)
        print dd
        f1 = open('E:/yaopintong1/' + filename + '.txt', 'a')
        f1.write(dd)
        f1.close()

def proxy():
    # install a randomly chosen HTTP proxy so repeated requests
    # do not all come from the same IP
    proxylist = ('59.39.88.190:8080',
                 '59.41.154.148:3128',
                 '59.41.239.13:9797',
                 '59.42.251.197:80',
                 '59.42.251.214:80',
                 '59.42.251.215:80',
                 '59.42.251.216:80',
                 '59.49.145.151:3128',
                 '59.49.248.216:3128')
    proxy = random.choice(proxylist)
    print proxy
    proxies = {'http': proxy}
    proxy_support = urllib2.ProxyHandler(proxies)
    opener = urllib2.build_opener(proxy_support)
    urllib2.install_opener(opener)
        
if __name__ == '__main__':

    # one base URL per line; 'manual' is appended to reach the insert page
    url_file = open("E://url2.txt")
    i = 1
    while 1:
        line = url_file.readline().rstrip()
        if not line:
            break
        print 'Fetching page ' + str(i) + ' ...'
        proxy()
        url = line + 'manual'
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
        headers = { 'User-Agent' : user_agent }
        request = urllib2.Request(url, headers = headers)
        try:
            response = urllib2.urlopen(request, timeout = 30)
            page = response.read()
        except Exception, e:
            # log failed URLs so they can be retried later
            print Exception, ":", e
            f1 = open('E:/url2_error.txt', 'a')
            f1.write(line + '\n')
            f1.close()
        else:
            ziduantiqu(page)
        print 'Page ' + str(i) + ' finished'
        i = i + 1
    url_file.close()



The program above collects pages in batch: the links are read from a plain text file, and the extraction itself is one big pile of matching and replacing. Pretty ugly, isn't it?
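For reference, each line of E://url2.txt is assumed to hold a drug page's base URL, to which the script appends 'manual'. For the target page above, a line would look like:

http://ypk.39.net/2017019/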

Now here is the pyquery version:

>>> from pyquery import PyQuery as pq
>>> import re
>>> v_source = pq(url = 'http://ypk.39.net/2017019/manual')
>>> for data in v_source('div').filter('.tab_box'):
	dts = pq(data).find('dt')  # field names
	dds = pq(data).find('dd')  # field values
	for i in range(len(dts)):
		f = open('yaopin.txt', 'a')
		f.write(re.sub(r"\s", "", dts.eq(i).text().encode('utf8')))
		f.write('\n')
		f.write(dds.eq(i).text().encode('utf8'))
		f.write('\n')
		f.close()
		print dts.eq(i).text()
		print dds.eq(i).text()
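
For comparison, here is the same extraction as a plain script rather than an interactive session: a minimal sketch (Python 2) using pyquery's .items() iterator, keeping the yaopin.txt output file from the session above. It assumes, as the session does, that each .tab_box holds matching <dt>/<dd> pairs.

# -*- coding: gbk -*-
# Minimal script version of the pyquery session above (Python 2).
import re
from pyquery import PyQuery as pq

doc = pq(url='http://ypk.39.net/2017019/manual')

f = open('yaopin.txt', 'a')
for box in doc('div.tab_box').items():
    # pair each field name (<dt>) with its field value (<dd>)
    for dt, dd in zip(box.find('dt').items(), box.find('dd').items()):
        name = re.sub(r'\s', '', dt.text().encode('utf8'))
        value = dd.text().encode('utf8')
        f.write(name + '\n' + value + '\n')
        print name
        print value
f.close()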


