实验要求:收集10类文本,每类文本包含100000条,总计100万条数据,利用朴素贝叶斯或SVM进行文本分类。
收集数据(爬虫)
由于新闻类的实验数据比较容易收集,我们就从各大新闻网站如新浪,中国新闻网等爬取了十类文本数据,分别是军事、汽车、金融、教育、游戏、健康、IT、体育、娱乐、时尚十类新闻文章,爬取的每一类数据多于10万条,因为我们自己利用scrapy框架写的爬虫程序执行效率比较差,且有的新闻文章正文内容比较有限,分词后不能确定是否还有数据保留下来。
以下是爬虫代码:
-------items.py
对每一类新闻定义了一个类,爬取的数据包括标题,URL,以及正文内容。
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class SportsItem(scrapy.Item):
    """Container for one scraped sports article.

    `No` added for consistency with the other nine item classes; it is
    optional, so existing spiders that never set it keep working.
    """
    No = scrapy.Field()       # running article number (optional)
    title = scrapy.Field()    # headline
    link = scrapy.Field()     # article URL
    content = scrapy.Field()  # full body text
class EconomyItem(scrapy.Item):
    """Container for one scraped finance/economy article.

    `No` added for consistency with the other nine item classes; it is
    optional, so existing spiders that never set it keep working.
    """
    No = scrapy.Field()       # running article number (optional)
    title = scrapy.Field()    # headline
    link = scrapy.Field()     # article URL
    content = scrapy.Field()  # full body text
class PoliItem(scrapy.Item):
    """Container for one scraped politics article."""
    No = scrapy.Field()       # running article number
    title = scrapy.Field()    # headline
    link = scrapy.Field()     # article URL
    content = scrapy.Field()  # full body text
class CultureItem(scrapy.Item):
    """Container for one scraped culture article."""
    No = scrapy.Field()       # running article number
    title = scrapy.Field()    # headline
    link = scrapy.Field()     # article URL
    content = scrapy.Field()  # full body text
class EduItem(scrapy.Item):
    """Container for one scraped education article."""
    No = scrapy.Field()       # running article number
    title = scrapy.Field()    # headline
    link = scrapy.Field()     # article URL
    content = scrapy.Field()  # full body text
class ArmyItem(scrapy.Item):
    """Container for one scraped military article."""
    No = scrapy.Field()       # running article number
    title = scrapy.Field()    # headline
    link = scrapy.Field()     # article URL
    content = scrapy.Field()  # full body text
class SciItem(scrapy.Item):
    """Container for one scraped science/IT article."""
    No = scrapy.Field()       # running article number
    title = scrapy.Field()    # headline
    link = scrapy.Field()     # article URL
    content = scrapy.Field()  # full body text
class TrendItem(scrapy.Item):
    """Container for one scraped fashion/trend article."""
    No = scrapy.Field()       # running article number
    title = scrapy.Field()    # headline
    link = scrapy.Field()     # article URL
    content = scrapy.Field()  # full body text
class GameItem(scrapy.Item):
    """Container for one scraped game-news article."""
    No = scrapy.Field()       # running article number
    title = scrapy.Field()    # headline
    link = scrapy.Field()     # article URL
    content = scrapy.Field()  # full body text
class YuleItem(scrapy.Item):
    """Container for one scraped entertainment (yule) article."""
    No = scrapy.Field()       # running article number
    title = scrapy.Field()    # headline
    link = scrapy.Field()     # article URL
    content = scrapy.Field()  # full body text
---------pipelines.py
每一类存储在一个TXT文件中,以编号标记每一篇文章
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from items import SportsItem,EconomyItem,PoliItem,CultureItem,EduItem,ArmyItem,SciItem,TrendItem,GameItem,YuleItem
class DatasetPipeline(object):
    """Appends every scraped news item to its per-category plain-text file.

    Record layout on disk: optional serial number, title, link, body text,
    each on its own line, followed by a blank line as a record separator.
    """

    # Maps each item class to the file its records are appended to.
    # NOTE(review): culture items originally went to 'D:/poli.txt', which
    # would have mixed two categories in one dataset file — clearly a
    # copy-paste bug; they now get their own file.
    OUTPUT_FILES = (
        (SportsItem, '/home/hya/DataSet/sports.txt'),
        (EconomyItem, '/home/hya/DataSet/economy.txt'),
        (PoliItem, 'D:/poli.txt'),
        (CultureItem, 'D:/culture.txt'),
        (EduItem, 'D:/edu.txt'),
        (ArmyItem, 'D:/army.txt'),
        (SciItem, 'D:/sci.txt'),
        (TrendItem, 'D:/trend.txt'),
        (GameItem, 'D:/Data/dataset/game.txt'),
        (YuleItem, 'D:/Data/dataset/yule.txt'),
    )

    def process_item(self, item, spider):
        """Write `item` to its category file, then pass it down the pipeline.

        The original code only returned the item for two of the ten
        categories; Scrapy pipelines must always return the item so that
        any later pipeline stage still receives it.
        """
        for item_cls, path in self.OUTPUT_FILES:
            if isinstance(item, item_cls):
                with open(path, 'a') as fp:
                    fp.write(self._serialize(item))
                break
        return item

    @staticmethod
    def _serialize(item):
        """Render one item as the utf-8 text record stored on disk."""
        parts = []
        # Sports/Economy items carry no serial number; also tolerates an
        # unpopulated 'No' instead of raising KeyError.
        if 'No' in item:
            parts.append(str(item['No']))
        parts.append(item['title'].encode('utf-8'))
        parts.append(item['link'].encode('utf-8'))
        parts.append(item['content'].encode('utf-8'))
        return '\n'.join(parts) + '\n\n'
------game.py
以下是爬取游戏的程序
# coding:utf-8
import re
import scrapy
from scrapy.http import Request
from scrapy.selector import Selector
from DataSet.items import GameItem
import time
count = 0
class GameSpider(scrapy.spiders.Spider):
    """Crawls the game-news list pages on app178.com and yields GameItem
    records, numbering articles with the module-level `count` counter.
    """
    name = "game"
    # Other candidate sources, kept for reference:
    #   http://roll.mil.news.sina.com.cn/col/gjjq/index.shtml
    #   http://www.diyiyou.com/news/gnxw/index_2863.html

    # List pages are numbered xinwen_1.html .. xinwen_1385.html.  The URL
    # literal is inlined in the comprehension (not read from a class
    # attribute) so the class body also works under Python 3 scoping rules,
    # and no throwaway loop variables leak onto the class.
    start_urls = ["https://www.app178.com/xinwen_%d.html" % n
                  for n in range(1, 1386)]

    def parse(self, response):
        """Extract (link, title) pairs from one list page and follow each
        article link, forwarding title/link via the request meta.
        """
        selector = Selector(response)
        links = selector.xpath('//*[@class="list_left"]/ul/li/div/a/@href').extract()
        titles = selector.xpath('//*[@class="list_left"]/ul/li/div/a/text()').extract()
        print(links)
        # zip stops at the shorter list, so a stray extra href can no
        # longer raise IndexError on the titles list.
        for href, title in zip(links, titles):
            link = "https://www.app178.com" + href.strip()
            print(link)
            yield Request(link.encode('utf-8'),
                          meta={'title': title, 'link': link},
                          callback=self.parse_content)

    def parse_content(self, response):
        """Build a GameItem from one article page; drop pages whose body
        selector matches nothing.
        """
        global count
        item = GameItem()
        item["link"] = response.meta['link']
        item["title"] = response.meta['title']
        paragraphs = Selector(response).xpath(
            '//*[@class="jjzq_ny_left1_main"]/p/text()').extract()
        if not paragraphs:
            # BUG FIX: the original fell through and returned a half-filled
            # item here (no 'content'/'No' keys), which would crash the
            # pipeline; returning None makes Scrapy discard the item.
            return None
        item['content'] = ''.join(paragraphs)
        count = count + 1
        item['No'] = count
        print(count)
        return item