This is my code repository on GitHub; stars are welcome!
Below, we improve the program step by step.
#!/usr/bin/env python
# coding=utf-8
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from News_Scrapy.items import NewsScrapyItem
from scrapy.conf import settings
import os, pickle, signal
import sys

## Global set of already-crawled URLs, restored from disk if a previous run saved it
SAVED_URL = set()
if os.path.isfile(settings["SAVED_URL_PATH"]):
    with open(settings["SAVED_URL_PATH"], "rb") as handle:
        SAVED_URL = pickle.load(handle)

def save_url_pkl(sig, frame):
    # persist the crawled-URL set on Ctrl-C so the next run can skip duplicates
    with open(settings["SAVED_URL_PATH"], "wb") as handle:
        pickle.dump(SAVED_URL, handle)
    sys.exit(0)

signal.signal(signal.SIGINT, save_url_pkl)

class NetEaseSpider(CrawlSpider):
    name = "News_Scrapy"
    # parent domain, so links on war.163.com / gov.163.com are not filtered as offsite
    allowed_domains = ["163.com"]
    start_urls = [
        "http://news.163.com/domestic/",
        "http://news.163.com/world/",
        "http://news.163.com/shehui/",
        "http://war.163.com/",
        "http://gov.163.com/",
    ]
    rules = [
        Rule(SgmlLinkExtractor(allow=(r'http://news\.163\.com/[0-9]{2}/[0-9]{3,4}/[0-9]{1,2}/[a-zA-Z0-9]+\.html',)), callback="parse_item"),
        Rule(SgmlLinkExtractor(allow=(r'http://war\.163\.com/[0-9]{2}/[0-9]{3,4}/[0-9]{1,2}/[a-zA-Z0-9]+\.html',)), callback="parse_item"),
        Rule(SgmlLinkExtractor(allow=(r'http://gov\.163\.com/[0-9]{2}/[0-9]{3,4}/[0-9]{1,2}/[a-zA-Z0-9]+\.html',)), callback="parse_item"),
    ]

    def parse_item(self, response):
        if response.url not in SAVED_URL:
            SAVED_URL.add(response.url)
            sel_resp = Selector(response)
            news_item = NewsScrapyItem()
            news_item["news_title"] = sel_resp.xpath('//*[@id="h1title"]/text()').extract()
            news_item["news_date"] = sel_resp.xpath('//*[@id="epContentLeft"]/div[1]/div[1]/text()').re(r'[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}')
            news_item["news_source"] = sel_resp.xpath('//*[@id="ne_article_source"]/text()').extract()
            news_item["news_content"] = sel_resp.xpath('//*[@id="endText"]').extract()
            return news_item
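The spider reads SAVED_URL_PATH from the project settings, but that entry is not shown in this post. A minimal sketch of what it could look like in settings.py (the file name here is just a placeholder):

# settings.py (excerpt) -- SAVED_URL_PATH is the pickle file used to persist
# the set of already-crawled URLs between runs; the value is a placeholder.
SAVED_URL_PATH = "saved_url.pkl"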
Next we need the jieba segmentation library. If pip or easy_install cannot install it directly, we install it by hand:
fighter@pc:~$ ipython  # use ipython to find the site-packages directories
In [1]: import site; site.getsitepackages()
Out[1]: ['/usr/local/lib/python2.7/dist-packages', '/usr/lib/python2.7/dist-packages']
One option is to clone jieba straight into dist-packages:
fighter@pc:/usr/local/lib/python2.7/dist-packages$ sudo git clone https://github.com/fxsjy/jieba.git
The cleaner way, though, is to clone it elsewhere and install it with setup.py:
fighter@pc:~/Downloads$ git clone https://github.com/fxsjy/jieba.git
Cloning into 'jieba'...
remote: Counting objects: 2287, done.
remote: Total 2287 (delta 0), reused 0 (delta 0), pack-reused 2287
Receiving objects: 100% (2287/2287), 39.72 MiB | 3.12 MiB/s, done.
Resolving deltas: 100% (1293/1293), done.
Checking connectivity... done.
fighter@pc:~/Downloads$ ls
jieba
fighter@pc:~/Downloads$ cd jieba/
fighter@pc:~/Downloads/jieba$ ls
Changelog extra_dict jieba LICENSE MANIFEST.in README.md setup.py test
fighter@pc:~/Downloads/jieba$ sudo python setup.py install
......
fighter@pc:~/Downloads/jieba$ ls /usr/local/lib/python2.7/dist-packages/
jieba jieba-0.38.egg-info s s-2.8.2.egg-info
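As a quick sanity check (a minimal sketch, run from outside the cloned source tree so that the installed copy is picked up), confirm that the module imports and segments a sentence:

# check that the manually installed jieba is importable and working
import jieba
print("/".join(jieba.cut("我来到北京清华大学")))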
When you hit annoying problems like this, it feels as if only regular expressions can save you! Fortunately there is a great tool called BeautifulSoup, which saved me a huge amount of trouble; otherwise the parsing code would have made my head spin.
For details on using BeautifulSoup you can look at a small project of mine, so the whole parse step is handled with BeautifulSoup. The code is shown below:
# additional imports needed at the top of the spider file:
#   import re
#   from bs4 import BeautifulSoup
def parse_item(self, response):
    if response.url not in SAVED_URL:
        SAVED_URL.add(response.url)
        soup = BeautifulSoup(response.body)
        news_item = NewsScrapyItem()
        news_item["news_title"] = soup.find("title").string
        # publication time: prefer the on-page timestamp, fall back to the date encoded in the URL
        news_date_list = soup.findAll("div", {"class": ["ep-time-soure cDGray", "pub_time"]})
        news_date_re = re.findall(r"\d{2}/\d{4}/\d{2}", response.url)[0].split("/")
        news_item["news_date"] = "20" + news_date_re[0] + "-" + news_date_re[1][:2] + "-" + news_date_re[1][-2:] + " " + news_date_re[2]
        if len(news_date_list) != 0:
            news_item["news_date"] = news_date_list[0].string[:19]
        tmp_news_source = soup.find("a", {"id": "ne_article_source"})
        if tmp_news_source is not None:
            news_item["news_source"] = tmp_news_source.string
        else:
            news_item["news_source"] = "NetEase"
        # article body: concatenate the plain-text paragraphs inside div#endText
        data = soup.findAll("div", {"id": "endText"})[0]
        data_list = data.findAll("p", {"class": ""})
        contents = ""
        for item in data_list:
            if item.string is not None:
                test = item.string.encode("utf-8")
                contents = contents + test
        news_item["news_content"] = contents
        return news_item
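The parser fills a NewsScrapyItem, whose definition is not shown in this post. A minimal sketch of items.py, assuming exactly the fields used above (news_key is added in a later step):

# items.py -- fields inferred from the parse code above
from scrapy.item import Item, Field

class NewsScrapyItem(Item):
    news_title = Field()
    news_date = Field()
    news_source = Field()
    news_content = Field()
    news_key = Field()  # keyword/weight map, filled in a later step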
First, get familiar with how the jieba segmentation library works; see the demo program on its GitHub page. Here I only show the parts I need:
#encoding=utf-8
from __future__ import unicode_literals
import sys
sys.path.append("../")
import jieba
import jieba.posseg
import jieba.analyse

print('='*40)
print('1. Segmentation')
print('-'*40)
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search-engine mode
print(", ".join(seg_list))

s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
for x, w in jieba.analyse.extract_tags(s, withWeight=True):
    print('%s %s' % (x, w))

print('-'*40)
print(' TextRank')
print('-'*40)
for x, w in jieba.analyse.textrank(s, withWeight=True):
    print('%s %s' % (x, w))
print
We only need to replace the sample text in the code above with our news content; with a small change it is ready to use:
key_map = {}
for x, w in jieba.analyse.extract_tags(contents, withWeight=True):
    key_map[x] = w
news_item["news_key"] = json.dumps(key_map)
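As a standalone illustration (a sketch using one sentence from the jieba demo above), the same step produces a JSON string mapping each extracted keyword to its weight:

# extract weighted keywords from a sample sentence and serialize them as JSON
import json
import jieba.analyse

sample = "吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。"
key_map = {}
for x, w in jieba.analyse.extract_tags(sample, withWeight=True):
    key_map[x] = w
print json.dumps(key_map)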
# the full spider, with the keyword step merged into parse_item_yield;
# additional imports needed at the top of the file:
#   import re, json
#   import jieba.analyse
#   from bs4 import BeautifulSoup
#   from scrapy.http import Request
class NetEaseSpider(CrawlSpider):
    name = "News_Scrapy"
    # parent domain, so links on war.163.com / gov.163.com are not filtered as offsite
    allowed_domains = ["163.com"]
    start_urls = [
        "http://news.163.com",
        "http://news.163.com/domestic/",
        "http://news.163.com/world/",
        "http://news.163.com/shehui/",
        "http://war.163.com/",
        "http://gov.163.com/",
    ]
    rules = [
        # channel pages: scan them for further links
        Rule(SgmlLinkExtractor(allow=(r'http://[a-z]+\.163\.com/[a-z]*',)), callback="parse_item"),
        # article pages: extract the news item directly
        Rule(SgmlLinkExtractor(allow=(r'http://[a-z]+\.163\.com/[0-9]{2}/[0-9]{3,4}/[0-9]{1,2}/[a-zA-Z0-9]+\.html',)), callback="parse_item_yield"),
    ]
    detail_re = re.compile(r'http://[a-z]+\.163\.com/[0-9]{2}/[0-9]{3,4}/[0-9]{1,2}/[a-zA-Z0-9]+\.html')
    head_re = re.compile(r'http://[a-z]+\.163\.com')

    def parse_item(self, response):
        # channel page: only harvest links and hand them to parse_item_yield
        if response.url not in SAVED_URL:
            SAVED_URL.add(response.url)
            soup = BeautifulSoup(response.body)
            for item in soup.findAll("a"):
                if item.has_attr("href"):
                    head_url_list = re.findall(self.head_re, item["href"])
                    detail_url_list = re.findall(self.detail_re, item["href"])
                    for tmp in head_url_list:
                        if tmp not in SAVED_URL:
                            yield Request(tmp, callback=self.parse_item_yield)
                    for tmp in detail_url_list:
                        if tmp not in SAVED_URL:
                            yield Request(tmp, callback=self.parse_item_yield)

    def parse_item_yield(self, response):
        # article page: extract the item, then keep following in-site links
        if response.url not in SAVED_URL:
            SAVED_URL.add(response.url)
            soup = BeautifulSoup(response.body)
            news_item = NewsScrapyItem()
            news_item["news_title"] = u"网易新闻"
            if soup.find("title") is not None:
                news_item["news_title"] = soup.find("title").string
            news_date_list = soup.findAll("div", {"class": ["ep-time-soure cDGray", "pub_time"]})
            news_date_re = re.findall(r"\d{2}/\d{4}/\d{2}", response.url)[0].split("/")
            news_item["news_date"] = "20" + news_date_re[0] + "-" + news_date_re[1][:2] + "-" + news_date_re[1][-2:] + " " + news_date_re[2]
            if len(news_date_list) != 0:
                news_item["news_date"] = news_date_list[0].string[:19]
            tmp_news_source = soup.find("a", {"id": "ne_article_source"})
            if tmp_news_source is not None:
                news_item["news_source"] = tmp_news_source.string
            else:
                news_item["news_source"] = "NetEase"
            data = soup.findAll("div", {"id": "endText"})[0]
            data_list = data.findAll("p", {"class": ""})
            contents = ""
            for item in data_list:
                if item.string is not None:
                    test = item.string.encode("utf-8")
                    contents = contents + test
            news_item["news_content"] = contents
            key_map = {}
            for x, w in jieba.analyse.extract_tags(contents, withWeight=True):
                key_map[x] = w
            news_item["news_key"] = json.dumps(key_map)
            yield news_item
            for item in soup.findAll("a"):
                if item.has_attr("href"):
                    head_url_list = re.findall(self.head_re, item["href"])
                    detail_url_list = re.findall(self.detail_re, item["href"])
                    for tmp in head_url_list:
                        if tmp not in SAVED_URL:
                            yield Request(tmp, callback=self.parse_item_yield)
                    for tmp in detail_url_list:
                        if tmp not in SAVED_URL:
                            yield Request(tmp, callback=self.parse_item_yield)
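The post does not show how the yielded items are stored, but load_News_DB below reads news_title and news_key from what looks like a MongoDB collection. A plausible item pipeline could therefore look like the sketch below; the host, database, and collection names are assumptions, and the class must be enabled in ITEM_PIPELINES:

# pipelines.py -- store each crawled news item in MongoDB (sketch, names assumed)
import pymongo

class MongoNewsPipeline(object):
    def __init__(self):
        client = pymongo.MongoClient("localhost", 27017)
        self.collection = client["news_db"]["news"]

    def process_item(self, item, spider):
        # one document per news page, with the same field names as the item
        self.collection.insert(dict(item))
        return item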
After modifying the spider in this way, each run crawls for a few hours and collects roughly 4000 valid records.
With this in place, the cloud server can run the crawl on a schedule by itself!
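A cron job is the obvious way to do this; as one possible sketch in Python (the six-hour interval is an assumption, and the script must be started from the Scrapy project directory), a tiny wrapper can simply rerun the crawl in a loop:

# run_crawl_loop.py -- rerun the spider every six hours (sketch; a cron job works too)
import subprocess
import time

while True:
    # launch one full crawl and wait for it to finish
    subprocess.call(["scrapy", "crawl", "News_Scrapy"])
    time.sleep(6 * 60 * 60)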
def load_News_DB():
    # cursor is the MongoDB collection holding the crawled items (created elsewhere);
    # only the title and the keyword map are needed to build the inverted index
    contents = cursor.find({}, {"news_title": 1, "news_key": 1, "_id": 0})
    index = 0
    key_index_map = {}
    for item in contents:
        index = index + 1
        if item.has_key("news_key"):
            news_key_str = item["news_key"]
            news_key = json.loads(news_key_str)
            for word in news_key.keys():
                if word.isdigit() == False:
                    if key_index_map.has_key(word):
                        key_index_map[word].append((item["news_title"], news_key[word]))
                    else:
                        key_index_map[word] = [(item["news_title"], news_key[word])]
        if index % 10000 == 0:
            # flush every 10000 records into its own index file, titles sorted by weight
            with open("index_" + str(index) + ".json", 'w') as handle:
                for tmp in key_index_map.keys():
                    tmp_list = sorted(key_index_map[tmp], key=lambda x: x[1], reverse=False)
                    key_index_map[tmp] = []
                    for title in tmp_list:
                        key_index_map[tmp].append(title[0])
                handle.write(json.dumps(key_index_map))
            key_index_map = {}
    # write whatever is left after the last full chunk
    with open("index_" + str(index) + ".json", 'w') as handle:
        for tmp in key_index_map.keys():
            tmp_list = sorted(key_index_map[tmp], key=lambda x: x[1], reverse=False)
            key_index_map[tmp] = []
            for title in tmp_list:
                key_index_map[tmp].append(title[0])
        handle.write(json.dumps(key_index_map))
Because the data volume is so large, the records are processed in index order and written out in chunks of 10000. I later found that storing the index as JSON text wastes space and is awkward to process,
so I switched to the pkl format and pickled the map straight to disk.
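Switching the chunk writer from JSON to pickle only changes the dump step; a minimal sketch of the replacement (the .pkl suffix matches what read_news_index expects below):

# write one chunk of the inverted index as a pickle file instead of JSON text
import pickle

def dump_index_chunk(key_index_map, index):
    with open("index_" + str(index) + ".pkl", "wb") as handle:
        pickle.dump(key_index_map, handle)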
#-*- coding:utf-8 -*-
import web
import json, os
import pickle

urls = (
    "/", "index"
)

class index:
    def GET(self):
        web.header('Access-Control-Allow-Origin', '*')
        web.header('Content-Type', 'application/json')
        before_data = web.input()
        data = None
        if before_data.has_key("CONTENTS"):
            if before_data["CONTENTS"] == "SEARCH":
                key_info = before_data["QUERY"]
                if len(key_info) == 0:
                    return json.dumps({"name": "search", "code": "1", "status": "No Query Word"})
                data = self.search_related(key_info)
        return json.dumps(data)

    def POST(self):
        web.header('Access-Control-Allow-Origin', '*')
        web.header('Access-Control-Allow-Headers', 'content-type')
        web.header('Content-Type', 'application/json')
        print web.input()
        data = self.getdata()  # getdata() is defined elsewhere and not shown here
        return data

    def OPTIONS(self):
        web.header('Access-Control-Allow-Origin', '*')
        web.header('Access-Control-Allow-Headers', 'content-type')
        web.header('Content-Type', 'application/json')
        return

    def search_related(self, key_word):
        result = {'name': 'search', "data": []}
        result["data"] = self.read_news_index(key_word)
        return result

    def read_news_index(self, key_word):
        # scan the pickled index chunks until enough matching titles are collected
        base_dir = "../../News_Index/"
        title_list = []
        for filename in os.listdir(base_dir):
            if filename.endswith("pkl"):
                with open(base_dir + filename, "rb") as handle:
                    index_map = pickle.load(handle)
                if index_map.has_key(key_word):
                    title_list.extend(index_map[key_word])
                if len(title_list) >= 10:
                    break
        return title_list

if __name__ == "__main__":
    web.config.debug = False
    app = web.application(urls, globals())
    app.run()
For the details of the front-end communication, see the JS script.
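To try the search endpoint without the front end, a quick Python client works too (a sketch; it assumes the web.py app is running locally on its default port 8080):

# -*- coding: utf-8 -*-
# query the search API the same way the JS front end does
import json
import urllib
import urllib2

params = urllib.urlencode({"CONTENTS": "SEARCH", "QUERY": "吉林"})
resp = urllib2.urlopen("http://127.0.0.1:8080/?" + params)
print json.loads(resp.read())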