BUPT Data Mining and Data Warehousing: Text Classification Lab (1)

Lab requirements: collect text in ten categories, with 100,000 documents per category (one million documents in total), and classify the texts with Naive Bayes or an SVM.

  1. Collect the data (web crawler)
  2. Segment the text with pynlpir, the Python wrapper for the ICTCLAS/NLPIR segmenter
  3. Compute per-word TF-IDF with sklearn
  4. Classify the texts with Naive Bayes (see the sketch after this list)
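
Steps 2-4 belong to the later parts of this series; as a preview, here is a minimal sketch of how they fit together. The merged corpus.txt with one "label<TAB>text" line per article is a hypothetical input layout, not the TXT format the crawler below produces:

import pynlpir
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

pynlpir.open()  # start the NLPIR/ICTCLAS engine
texts, labels = [], []
with open('corpus.txt') as fp:  # hypothetical merged corpus: "label\ttext" per line
    for line in fp:
        label, doc = line.rstrip('\n').split('\t', 1)
        # Segment, then rejoin with spaces so TfidfVectorizer can tokenize.
        texts.append(' '.join(pynlpir.segment(doc, pos_tagging=False)))
        labels.append(label)
pynlpir.close()

# TF-IDF features; this token_pattern also keeps single-character words.
X = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b').fit_transform(texts)

X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2)
clf = MultinomialNB().fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))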

Collecting the Data (Web Crawler)
Since news data is relatively easy to collect, we crawled ten categories of articles from major news sites such as Sina and chinanews.com: military, automotive, finance, education, games, health, IT, sports, entertainment, and fashion. For each category we crawled more than 100,000 articles as a buffer, because our hand-written Scrapy crawler runs rather slowly and some articles have so little body text that we could not be sure how much data would survive word segmentation.
The crawler code follows:
-------items.py
A class is defined for each news category; the scraped fields are the title, the URL, and the body text.

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class SportsItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    content = scrapy.Field()

class EconomyItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    content = scrapy.Field()

class PoliItem(scrapy.Item):
    No = scrapy.Field()  # sequence number assigned by the spider
    title = scrapy.Field()
    link = scrapy.Field()
    content = scrapy.Field()

class CultureItem(scrapy.Item):
    No = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    content = scrapy.Field()

class EduItem(scrapy.Item):
    No = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    content = scrapy.Field()

class ArmyItem(scrapy.Item):
    No = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    content = scrapy.Field()

class SciItem(scrapy.Item):
    No = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    content = scrapy.Field()

class TrendItem(scrapy.Item):
    No = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    content = scrapy.Field()

class GameItem(scrapy.Item):
    No = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    content = scrapy.Field()

class YuleItem(scrapy.Item):
    No = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    content = scrapy.Field()
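
Since the ten classes declare the same fields, an equivalent and more compact items.py could use Scrapy's item inheritance (fields declared on a base Item are inherited by subclasses). The NewsItem base class below is my own naming, not part of the original project:

class NewsItem(scrapy.Item):
    # Fields shared by every category.
    No = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    content = scrapy.Field()

class SportsItem(NewsItem):
    pass  # the other nine categories subclass NewsItem the same way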

---------pipelines.py
Each category is stored in its own TXT file, with a sequence number marking each article.

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

from DataSet.items import (SportsItem, EconomyItem, PoliItem, CultureItem,
                           EduItem, ArmyItem, SciItem, TrendItem, GameItem,
                           YuleItem)


class DatasetPipeline(object):
    # One output file per category; items are appended as they arrive.
    output_paths = {
        SportsItem: '/home/hya/DataSet/sports.txt',
        EconomyItem: '/home/hya/DataSet/economy.txt',
        PoliItem: 'D:/poli.txt',
        CultureItem: 'D:/culture.txt',
        EduItem: 'D:/edu.txt',
        ArmyItem: 'D:/army.txt',
        SciItem: 'D:/sci.txt',
        TrendItem: 'D:/trend.txt',
        GameItem: 'D:/Data/dataset/game.txt',
        YuleItem: 'D:/Data/dataset/yule.txt',
    }

    def process_item(self, item, spider):
        path = self.output_paths.get(type(item))
        if path is None:
            return item
        with open(path, 'a') as fp:
            # SportsItem and EconomyItem carry no sequence number.
            if 'No' in item:
                fp.write(str(item['No']) + '\n')
            fp.write(item['title'].encode('utf-8') + '\n'
                     + item['link'].encode('utf-8') + '\n'
                     + item['content'].encode('utf-8') + '\n\n')
        return item
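
As the boilerplate comment at the top of pipelines.py reminds us, the pipeline only runs once it is registered in the project's settings.py. Assuming the project package is named DataSet (as the spider's import below suggests), that entry would look like:

ITEM_PIPELINES = {
    'DataSet.pipelines.DatasetPipeline': 300,
}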

------game.py
The following spider crawls the game category.

# coding:utf-8

import scrapy
from scrapy.http import Request
from scrapy.selector import Selector
from DataSet.items import GameItem

count = 0  # global sequence number shared by all scraped articles


class GameSpider(scrapy.spiders.Spider):
    name = "game"
    # Other candidate list pages:
    # http://roll.mil.news.sina.com.cn/col/gjjq/index.shtml
    # http://www.diyiyou.com/news/gnxw/index_2863.html

    # List pages xinwen_1.html through xinwen_1385.html.
    start_urls = ["https://www.app178.com/xinwen_%d.html" % i
                  for i in range(1, 1386)]

    def parse(self, response):
        # Pull the article links and titles off each list page.
        selector = Selector(response)
        links = selector.xpath('//*[@class="list_left"]/ul/li/div/a/@href').extract()
        titles = selector.xpath('//*[@class="list_left"]/ul/li/div/a/text()').extract()
        for i in range(len(links)):
            link = "https://www.app178.com" + links[i].strip()
            yield Request(link, meta={'title': titles[i], 'link': link},
                          callback=self.parse_content)  # parse content

    def parse_content(self, response):
        # Extract the article body and fill in a GameItem.
        global count
        item = GameItem()
        item["link"] = response.meta['link']
        item["title"] = response.meta['title']
        sel = Selector(response)
        content = sel.xpath('//*[@class="jjzq_ny_left1_main"]/p/text()').extract()
        if len(content) != 0:
            item['content'] = ''.join(content)  # join the paragraphs into one line
            count = count + 1
            item['No'] = count
            return item
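
Assuming the standard Scrapy project layout, the spider is launched by name from the project root:

scrapy crawl game

The spiders for the other nine categories presumably follow the same pattern, differing only in their list-page URLs, XPath expressions, and item class.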
