抓取的目标网页:http://ypk.39.net/2017019/manual
主要抓取的内容是药品说明书。
下面先给出正则表达式的抓取方式:
# -*- coding: gbk -*-
# NOTE: Python only honours an encoding declaration on the first or second
# line of a file (PEP 263), and it must match the encoding the script is
# actually saved in.  The byte-string literals below are compared against the
# raw page bytes, so presumably the target site serves GBK -- confirm.
"""Scrape drug instruction-manual pages with regular expressions (Python 2).

URL prefixes are read from ``E://url2.txt`` (one per line); ``'manual'`` is
appended to each, the page is fetched through a randomly chosen HTTP proxy,
and the text of its ``tab_box`` sections is appended to
``E:/yaopintong1/<name>.txt``.  URLs that could not be fetched or parsed are
appended to ``E:/url2_error.txt``.

``print`` is always called with one parenthesised argument, which produces
the same output as the Python 2 print statement.
"""
import urllib2
import re
import random
import os

OUTPUT_DIR = 'E:/yaopintong1/'
URL_LIST_PATH = "E://url2.txt"
ERROR_LOG_PATH = 'E:/url2_error.txt'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'

PROXY_LIST = (
    '59.39.88.190:8080',
    '59.41.154.148:3128',
    '59.41.239.13:9797',
    '59.42.251.197:80',
    '59.42.251.214:80',
    '59.42.251.215:80',
    '59.42.251.216:80',
    '59.49.145.151:3128',
    '59.49.248.216:3128',
)

# Group 2 captures the text of the <dd> that follows the approval-number label.
APPROVAL_PATTERN = re.compile(
    '<div class="tab_box">(.*?)批准文号.*?<dd>(.*?)<.*?</div>', re.S)
# Group 2 captures everything from the first <dt> up to the closing </div>.
SECTION_PATTERN = re.compile('<div class="tab_box">(.*?)<dt>(.*?)</div>', re.S)
NON_ID_PATTERN = re.compile("[^A-Z0-9]")
WHITESPACE_PATTERN = re.compile(r"[\s]")
TAG_PATTERN = re.compile(r'<[^>]+>', re.S)

# Applied in order; '</strong><br/>' has to be handled before the bare '<br/>'.
TAG_REPLACEMENTS = (
    ('</dt>', '\n'),
    ('</strong><br/>', ''),
    ('<br/>', '\n'),
    ('</p>', '\n'),
    ('</dd>', '\n'),
    # All whitespace has already been stripped when this runs, so replacing a
    # plain space changes nothing.
    # NOTE(review): possibly an '&nbsp;' entity that was rendered as a space
    # when the snippet was published -- confirm against the original script.
    (' ', ''),
)


def _clean_section(html):
    """Turn one captured HTML fragment into plain text with line breaks."""
    content = WHITESPACE_PATTERN.sub("", html)
    for old, new in TAG_REPLACEMENTS:
        content = content.replace(old, new)
    return TAG_PATTERN.sub('', content)


def _log_failed_url(url_prefix):
    """Append ``url_prefix`` to the error list so it can be retried later."""
    with open(ERROR_LOG_PATH, 'a') as error_file:
        error_file.write(url_prefix + '\n')


def ziduantiqu(page):
    """Extract the manual text from ``page`` and append it to a text file.

    The output file is named after the upper-case letters and digits of the
    text captured behind the approval-number label.  If ``<name>.txt`` already
    exists in the output directory, ``<name>_1.txt`` is used instead.  Only
    this single suffix is tried, so further pages with the same name are
    appended to the existing ``_1`` file.

    Args:
        page: raw HTML of a manual page, as returned by ``response.read()``.

    Returns:
        True if at least one section was written, False if no usable file
        name or no section could be extracted (nothing is written then).

    Raises:
        OSError: if the output directory cannot be listed.
        IOError: if the output file cannot be opened or written.
    """
    filename = None
    for approval in APPROVAL_PATTERN.findall(page):
        # With several matches the last one decides the file name.
        filename = NON_ID_PATTERN.sub("", approval[1])
        if (filename + '.txt') in os.listdir(OUTPUT_DIR):
            filename = filename + '_1'
        print(filename)

    sections = SECTION_PATTERN.findall(page)
    if not filename or not sections:
        # Previously a page without an approval number raised NameError here
        # and an empty name produced a catch-all '.txt' file.
        return False

    for section in sections:
        text = _clean_section(section[1])
        print(text)
        with open(OUTPUT_DIR + filename + '.txt', 'a') as out_file:
            out_file.write(text)
    return True


def proxy():
    """Install a urllib2 opener that uses a randomly chosen HTTP proxy.

    The chosen index is printed.  ``urllib2.install_opener`` changes the
    opener used by every later ``urllib2.urlopen`` call in the process.
    """
    ii = random.randrange(len(PROXY_LIST))
    print(ii)
    proxy_support = urllib2.ProxyHandler({'http': PROXY_LIST[ii]})
    urllib2.install_opener(urllib2.build_opener(proxy_support))


def main():
    """Fetch and extract every URL listed in the URL list file."""
    with open(URL_LIST_PATH) as url_file:
        stripped = [line.strip() for line in url_file]
    # Skip blank lines instead of ending the whole run at the first one.
    url_prefixes = [line for line in stripped if line]

    headers = {'User-Agent': USER_AGENT}
    for i, line in enumerate(url_prefixes, 1):
        print('开始抓取第---' + str(i) + '------页内容')
        proxy()
        request = urllib2.Request(line + 'manual', headers=headers)
        try:
            response = urllib2.urlopen(request, timeout=30)
            try:
                page = response.read()
            finally:
                response.close()
        except Exception as e:
            # Best effort: any fetch failure is recorded and the run goes on.
            print('%s : %s' % (type(e).__name__, e))
            _log_failed_url(line)
            continue

        if ziduantiqu(page):
            print('第---' + str(i) + '------页内容抓取完成')
        else:
            _log_failed_url(line)


if __name__ == '__main__':
    main()
下面给出使用 pyquery 模块实现的抓取代码:
>>> from pyquery import PyQuery as pq >>> from lxml import etree >>> import re >>> v_source = pq(url = 'http://ypk.39.net/2017019/manual') >>> for data in v_source('div').filter('.tab_box'): for i in range(len(pq(data).find('dt'))): f = open('yaopin.txt','a') f.write(re.sub("[\s]","",pq(data).find('dt').eq(i).text().encode('utf8'))) f.write('\n') f.write(pq(data).find('dd').eq(i).text().encode('utf8')) f.write('\n') f.close() print pq(data).find('dt').eq(i).text() print pq(data).find('dd').eq(i).text()