main module (entry-point script):
# -*- coding:utf-8 -*- __author__ = 'zhaochaoye' import utlis import urllib import MySQLdb import jieba if __name__ == "__main__": # generate start url list start_urls = [] for i in range(100, 3750): url = "http://zxyxpt.suda.edu.cn/Detail.aspx?id="+str(i) start_urls.append(url) newsSpider = utlis.NewsSpider(start_urls) newsSpider.parse() host, user, pwd, db = "localhost", "root", "root", "databaseA" conn = MySQLdb.connect(host, user, pwd, db, charset='utf8') # 获取cursor对象来进行操作 cursor = conn.cursor() # 设置数据库编码格式 cursor.execute("SET NAMES utf8") cursor.execute("SET CHARACTER_SET_CLIENT=utf8") cursor.execute("SET CHARACTER_SET_RESULTS=utf8") for record in newsSpider.records: print record #编写sql语句 sql = "INSERT INTO yixiao VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (record.ID, record.category, record.state,record.time,record.TitleLB,record.ContentLB,record.ReplyLB,record. Retime) #执行sql语句 try: cursor.execute(sql) conn.commit() except: pass # 关闭数据库 cursor.close() conn.close()
utlis module (spider utilities; the name is a typo for "utils" but must stay as-is because the main script does `import utlis`):
# -*- coding:utf-8 -*- __author__ = 'zhaochaoye' from bs4 import BeautifulSoup import urllib import re import time import random class Record(object): def __init__(self, ID, category,state,time,TitleLB,ContentLB,ReplyLB,Retime): self.ID = ID self.category = category self.state = state self.time = time self.TitleLB = TitleLB self.ContentLB = ContentLB self.ReplyLB =ReplyLB self.Retime = Retime def __str__(self): return str(self.ID)+","+str(self.category.encode("gbk"))+","+str(self.state.encode("gbk"))+","+str(self.time.encode("gbk"))+","+str(self.TitleLB.encode("gbk"))+","+str(self.ContentLB.encode("gbk"))+","+str(self.ReplyLB.encode("gbk"))+","+str(self.Retime.encode("gbk")) def __repr__(self): return str(self.ID)+","+str(self.category.encode("gbk"))+","+str(self.state.encode("gbk"))+","+str(self.time.encode("gbk"))+","+str(self.TitleLB.encode("gbk"))+","+str(self.ContentLB.encode("gbk"))+","+str(self.ReplyLB.encode("gbk"))+","+str(self.Retime.encode("gbk")) class UrlParser(object): # 通过urllib访问页面,获取返回状态、头信息及页面内容 def __init__(self, url): self.url = url self.head_info = urllib.urlopen(url).info() self.status = urllib.urlopen(url).getcode() self.content = urllib.urlopen(url).read() # 页面URL抽取器,从页面中抽取[编号/议案类型/状态/提交时间/标题/内容/会办意见/回复时间] def url_extractor(self): # extract ID ID = self.extract_ID() # extract category category = self.extract_category() state = self.extract_state() time = self.extract_time() TitleLB = self.extract_TitleLB() ContentLB =self.extract_ContentLB() ReplyLB = self.extract_ReplyLB() Retime = self.extract_Retime() return Record(ID, category,state,time,TitleLB,ContentLB,ReplyLB,Retime) def extract_ID(self): soup = BeautifulSoup(self.content) id_span = soup.find("span", id="idlb") return id_span.get_text() def extract_category(self): soup = BeautifulSoup(self.content) id_span = soup.find("span", id="list_idLB") return id_span.get_text() def extract_state(self): soup = BeautifulSoup(self.content) id_span =soup.find(id="stateLB") return id_span.get_text() 
def extract_time(self): soup = BeautifulSoup(self.content) id_span =soup.find(id="TimeLB") return id_span.get_text() def extract_TitleLB(self): soup = BeautifulSoup(self.content) id_span =soup.find(id="TitleLB") return id_span.get_text() def extract_ContentLB(self): soup = BeautifulSoup(self.content) id_span =soup.find(id="ContentLB") return id_span.get_text() def extract_ReplyLB(self): soup = BeautifulSoup(self.content) id_span =soup.find(id="ReplyLB") return id_span.get_text() def extract_Retime(self): soup = BeautifulSoup(self.content) id_span =soup.find(id="RetimeLB") return id_span.get_text() class NewsSpider(object): # 初始化需要采集的游记列表页面URL def __init__(self, crawl_urls): self.crawl_urls = crawl_urls self.records = [] # 解析页面获取游记页面URL def parse(self): print "crawling travel urls" cout = 0 for url in self.crawl_urls: url_parser = UrlParser(url) self.records.append(url_parser.url_extractor()) time.sleep(random.random()/10) cout += 1 print cout, url