Automated scraping and database import with Python

# coding:utf-8
'''Purpose: scrape Baidu News (http://news.baidu.com/). Baidu News aggregates articles from many industry sites and already deduplicates and filters them, so it is a good source for collecting news in your own industry.
Approach: 1. use a dict to map each site's domain to its extraction regexes and page encoding; 2. record every scraped URL in a file and check it to decide whether a URL has already been scraped; 3. Baidu News refreshes about every five minutes, so a companion job can run this script every few minutes.
'''
import pycurl, StringIO, json, urllib, urllib2, re
import MySQLdb
import time
from warnings import filterwarnings
filterwarnings('ignore', category=MySQLdb.Warning)	# silence MySQLdb warnings raised during inserts
import sys
reload(sys)
sys.setdefaultencoding('utf8')	# Python 2: avoid UnicodeDecodeError when mixing str and unicode
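
# A minimal sketch of the database-import step; the connection parameters,
# table, and column names below are assumptions to adapt to the real schema.
def save_news(url, title, content):
	conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='', db='news', charset='utf8')
	cur = conn.cursor()
	cur.execute("INSERT IGNORE INTO news(url, title, content, addtime) VALUES(%s, %s, %s, %s)",
		(url, title, content, time.strftime('%Y-%m-%d %H:%M:%S')))	# skips duplicates, assuming a unique index on url
	conn.commit()
	cur.close()
	conn.close()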


headers = [
	"User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36",
	"Cookie: spversion=20130314; historystock=603158%7C*%7C1A0001%7C*%7C000967%7C*%7C603328; Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1=1467682875,1467682943,1467682974,1468293176; Hm_lpvt_78c58f01938e4d85eaf619eae71b4ed1=1468293226",
]
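
# A sketch of point 2 from the docstring: keep every processed URL in a plain
# text file and skip what is already there. The file name 'url.txt' is an assumption.
def url_seen(url):
	try:
		collected = open('url.txt').read()
	except IOError:		# first run: no record file yet
		collected = ''
	if url in collected:
		return True
	open('url.txt', 'a').write(url + '\n')	# remember the URL for later runs
	return False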

def curl(url):
	c = pycurl.Curl()	# build a curl object
	#c.setopt(pycurl.REFERER, 'http://qy.m.58.com/')	# set the referer
	c.setopt(pycurl.FOLLOWLOCATION, True)	# follow redirects automatically
	c.setopt(pycurl.MAXREDIRS, 5)			# follow at most 5 redirects
	c.setopt(pycurl.CONNECTTIMEOUT, 60)		# connection timeout in seconds
	c.setopt(pycurl.TIMEOUT, 120)			# download timeout in seconds
	c.setopt(pycurl.ENCODING, 'gzip,deflate')	# accept compressed responses
	# c.setopt(c.PROXY, ip)	# proxy
	c.fp = StringIO.StringIO()	# buffer for the response body
	c.setopt(pycurl.URL, url)	# URL to fetch
	c.setopt(pycurl.HTTPHEADER, headers)	# send the request headers defined above
	# c.setopt(pycurl.POST, 1)
	# c.setopt(pycurl.POSTFIELDS, data)		# POST body
	c.setopt(c.WRITEFUNCTION, c.fp.write)	# write callback fills the string buffer
	c.perform()

	code = c.getinfo(c.HTTP_CODE)	# HTTP status code, available if needed
	html = c.fp.getvalue()	# raw response body
	c.close()
	return html
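
# Usage (commented out): fetch the Baidu News list page; the raw bytes are
# decoded later with each source site's own encoding from req_dict.
# baidu_html = curl('http://news.baidu.com/')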

# Extract the first capture group with a regex; return 'no' when nothing matches
def search(req, html):
	text = re.search(req, html)
	if text:
		data = text.group(1)
	else:
		data = 'no'
	return data
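
# A sketch (the helper name is my own) tying the pieces together; req_dict and
# content_sort are defined below and are resolved when the function is called.
def extract_article(url):
	domain = url.split('/')[2]	# e.g. 'finance.sina.com.cn'
	html = curl(url).decode(req_dict[domain]['decode'], 'ignore')
	title = search(req_dict[domain]['title'], html)
	content = content_sort(search(req_dict[domain]['content'], html))
	return title, content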

# Strip links, scripts, and leftover tags from the extracted article body;
# the rules below cover the usual offenders, tune them per target markup.
def content_sort(content):
	content = re.sub('<p[^>]*?>', '\n\n', content, flags=re.I)	# paragraph opens become blank lines
	content = re.sub('<br[^>]*?>', '\n\n', content)			# line breaks become blank lines
	content = re.sub('</p>', '', content)
	content = re.sub('<a[^>]*?>', '', content)
	content = re.sub('</a>', '', content)
	content = re.sub('<img[^>]*?>', '', content, flags=re.I)
	content = re.sub('<span[^>]*?>', '', content, flags=re.I)
	content = re.sub('</span>', '', content, flags=re.I)
	content = re.sub('<strong[^>]*?>', '', content)
	content = re.sub('</strong>', '', content)
	content = re.sub('<font[^>]*?>', '', content)
	content = re.sub('</font>', '', content)
	content = re.sub('<script[\s\S]*?</script>', '', content)	# drop embedded scripts wholesale
	content = re.sub('<div[^>]*?>', '', content)
	content = re.sub('</div>', '', content)
	content = re.sub('<!--[\s\S]*?-->', '', content)		# drop HTML comments wholesale
	content = re.sub('&nbsp;', '', content)				# drop whitespace entities
	content = re.sub('&ensp;', '', content)
	content = re.sub('&emsp;', '', content)
	return content

# Domain -> title/content regexes and page encoding. The <title> patterns are
# straightforward; each 'content' pattern must match that site's article
# container, so verify the div selectors against the live markup before use.
req_dict = {
	'finance.sina.com.cn':	{'title':'<title>(.*?)</title>','content':'<div id="artibody">([\s\S]*?)</div>','decode':'utf-8'},
	'stock.eastmoney.com':	{'title':'<title>(.*?)</title>','content':'<div id="ContentBody">([\s\S]*?)</div>','decode':'gbk'},
	'finance.eastmoney.com':	{'title':'<title>(.*?)</title>','content':'<div id="ContentBody">([\s\S]*?)</div>','decode':'gbk'},#ok
	'guba.eastmoney.com':	{'title':'<title>(.*?)_.*?','content':'<div id="zwconbody">([\s\S]*?)</div>','decode':'utf-8'},#ok
	'stock.jrj.com.cn':	{'title':'<title>(.*?)-','content':'<div class="texttit_m1">([\s\S]*?)<div id="itougu">','decode':'gbk'},
	'hk.jrj.com.cn':	{'title':'<title>(.*?)-','content':'<div class="texttit_m1">([\s\S]*?)<div id="itougu">','decode':'gbk'},
	'hkstock.cnfol.com':	{'title':'<title>(.*?)_.*?','content':'<div class="Article">([\s\S]*?)</div>','decode':'utf-8'},#ok
	'sc.stock.cnfol.com':	{'title':'<title>(.*?)_.*?','content':'<div class="Article">([\s\S]*?)</div>','decode':'utf-8'},#ok
	'money.163.com':	{'title':'<title>(.*?)_.*?','content':'<div id="endText">([\s\S]*?)</div>','decode':'utf-8'},
	'www.chinastock.com.cn':	{'title':'<div class="d_title">([\s\S]*?)</div>','content':'<div class="d_content">([\s\S]*?)</div>','decode':'utf-8'},
	'stock.huagu.com':	{'title':'<div class="title">([\s\S]*?)</div>','content':'<div class="article_con">([\s\S]*?)</div>','decode':'utf-8'},
	'stock.sohu.com':	{'title':'<div class="article-title">([\s\S]*?)</div>','content':'<div itemprop="articleBody">([\s\S]*?)</div>','decode':'gbk'},
	'stock.hexun.com':	{'title':'<title>(.*?)-.*?','content':'<div class="art_contextBox">([\s\S]*?)</div>','decode':'utf-8'},
	'hk.stock.hexun.com':	{'title':'<title>(.*?)[-_|].*?','content':'<div class="art_contextBox">([\s\S]*?)</div>','decode':'utf-8'},
	'stock.gucheng.com':	{'title':'<title>(.*?)[-_|].*?','content':'<div class="content">([\s\S]*?)</div>','decode':'utf-8'},
	'www.cnstock.com':	{'title':'<title>(.*?)-.*?','content':'<div class="content">([\s\S]*?)</div>','decode':'gbk'},
	'www.ccstock.cn':	{'title':'<title>(.*?)-.*?','content':'<div class="newscontent">([\s\S]*?)</div>','decode':'utf-8'},
	'news.emoney.cn':	{'title':'<title>(.*?)-.*?','content':'<div class="RL_details_content">([\s\S]*?)