# Python crawler: crawler demo
# -*- coding: utf-8 -*-

import urllib
import urllib2
import re

def getDetailUrl(name):
    """Split an HTML anchor into its link target and its link text.

    Args:
        name: HTML content of a table cell that is expected to hold an
            ``<a href="...">...</a>`` link.

    Returns:
        A ``(href, text)`` tuple.  When *name* contains no anchor the
        result is ``('', name)`` so that callers indexing ``[0]`` and
        ``[1]`` keep working and the raw cell text is preserved.
    """
    # NOTE(review): the pattern reached this file as r'(.*?)' -- the HTML
    # tags around the groups appear to have been stripped.  The caller reads
    # two groups (URL first, text second), so an anchor pattern is
    # reconstructed here; confirm it against the real page markup.
    match = re.search(
        r'<a\b[^>]*?\bhref\s*=\s*["\'](.*?)["\'][^>]*>(.*?)</a>',
        name,
        re.I | re.S,
    )
    if match is None:
        return ('', name)
    return match.groups()
    
#处理一页数据   
def getlist(page):
    """Parse one listing page into a list of row dictionaries.

    The first table found in *page* is extracted, HTML comments are removed
    from it, the header cells provide the column names and the data cells
    are grouped into rows of that many columns.

    Args:
        page: HTML text of one listing page.

    Returns:
        A list with one dict per complete table row, keyed by header text.
        For the '企业名称' column the cell is passed to getDetailUrl(); its
        link text is stored under '企业名称' and the detail URL (prefixed
        with ``rooturl``) under 'url'.  An empty list is returned when no
        table, no header section or no header cells are found.
    """
    # NOTE(review): every pattern in this function lost its HTML tags when
    # the file was extracted (only '.*?' / '(.*?)' fragments survived, two of
    # them as unterminated literals).  They are reconstructed from the
    # original step comments (content table, HTML comments, table header,
    # table cells) using generic tag names -- confirm against the real page.
    flags = re.I | re.S

    # Locate the content table.
    table_match = re.search(r'<table\b.*?</table>', page, flags)
    if table_match is None:
        return []

    # Remove HTML comments (the original used re.I only, so comments that
    # span several lines are left in place).
    table = re.compile(r'<!--.*?-->', re.I).sub('', table_match.group())

    # Column names come from the header section.
    thead_match = re.search(r'<thead\b.*?</thead>', table, flags)
    if thead_match is None:
        return []
    # \b keeps '<th' from matching the '<thead>' tag itself.
    head = re.findall(r'<th\b[^>]*>(.*?)</th>', thead_match.group(), flags)
    if not head:
        # With zero columns the original row loop never advanced.
        return []

    cells = re.findall(r'<td\b[^>]*>(.*?)</td>', table, flags)

    rows = []
    width = len(head)
    # Only complete rows are emitted; trailing cells that do not fill a
    # whole row are ignored, as in the original loop condition.
    for start in range(0, len(cells) - width + 1, width):
        row = {}
        for column, cell in zip(head, cells[start:start + width]):
            if column == '企业名称':
                link = getDetailUrl(cell)
                if len(link) >= 2:
                    # Store the parsed name/URL and do NOT overwrite the
                    # name with the raw cell HTML (the original did).
                    row['企业名称'] = link[1]
                    row['url'] = rooturl + link[0]
                    continue
            row[column] = cell
        rows.append(row)
    return rows


# Root URL of the developer-enterprise listing; also the prefix of every
# detail link.
rooturl = "http://www.jnfdc.gov.cn/kfqy/"


def main():
    """Crawl the listing pages and dump every collected row to a text file."""
    # levelno=-1: no restriction on qualification level.
    values = {"entname": "", "levelno": "-1"}
    data = urllib.urlencode(values)

    # NOTE(review): crawling starts at page 21 although page 0 has its own
    # URL form below -- this looks like a debugging leftover; confirm.
    pageNum = 21
    entlist = []
    while True:
        param1 = "" if pageNum == 0 else "_" + str(pageNum)
        geturl = rooturl + "index" + param1 + ".shtml" + "?" + data
        pageNum += 1
        response = urllib2.urlopen(urllib2.Request(geturl))
        try:
            page = response.read()
        finally:
            response.close()
        res = getlist(page)
        if not res:
            # A page without rows ends the crawl.
            break
        # Reuse the parsed rows instead of parsing the page a second time.
        entlist += res

    # Database section: the connection is opened as in the original, but is
    # now always closed again.
    import MySQLdb
    conn = MySQLdb.connect('localhost', 'root', '***', 'test', charset='utf8')
    cursor = conn.cursor()
    try:
        if len(entlist) > 1:
            print(entlist[1])
        # The bulk insert was disabled in the original script; kept here for
        # reference:
        #   sql = "insert into fdc_ent_info value (%(序号)s, %(企业名称)s, %(法人代表)s, %(资质编号)s, %(资质等级)s, %(url)s)"
        #   cursor.executemany(sql, entlist)
        #   conn.commit()
        # (on failure: traceback.print_exc() followed by conn.rollback())
    finally:
        cursor.close()
        conn.close()

    print('file...')
    # 'with' guarantees the file is closed; the original referenced f.close
    # without calling it.
    with open("d:\\entinfo.txt", 'w') as f:
        for e in entlist:
            fields = ", ".join(key + ":" + value for key, value in e.items())
            f.write(fields + '\n')
    print('done')


if __name__ == "__main__":
    main()

# You may also be interested in: (Python / web crawlers)