Reuters-21578 数据集处理

具体处理方法如下:

处理后的语料下载地址

点击打开链接

from time import strptime
import csv
import numpy

# Field names collected for every article. get_article_components looks each
# one up as a <NAME>...</NAME> element in the article text.
article_components = [
	'DATE',
	'PLACES',
	'DATELINE',
	'TOPICS',
	'PEOPLE',
	'ORGS',
	'EXCHANGES',
	'COMPANIES',
	'TITLE',
	'BODY',
	'HOUR',
]

# Relative names of the 22 input files, reut2-000.sgm through reut2-021.sgm.
# list() over a generator expression is used instead of a list comprehension
# so that no loop variable is left behind at module level on Python 2.
files = list('reut2-%03d.sgm' % file_number for file_number in range(22))

def extract_text_between_tags(tag_word, string):
	"""Return the text enclosed by ``<tag_word>`` and ``</tag_word>`` in *string*.

	The text runs from the end of the first opening tag to the start of the
	first matching closing tag that follows it.

	Args:
		tag_word: Tag name without angle brackets, e.g. ``'TITLE'``.
		string: Text to search.

	Returns:
		The enclosed text, or ``None`` when either tag is missing or the
		enclosed text is empty.
	"""
	open_tag = '<' + tag_word + '>'
	close_tag = '</' + tag_word + '>'
	try:
		start = string.index(open_tag) + len(open_tag)
		# Search from `start` so a closing tag that appears before the opening
		# tag can never be picked up as the end of the element.
		end = string.index(close_tag, start)
	except ValueError:
		# str.index raises ValueError when a tag is not present.
		return None
	# An empty element is reported the same way as a missing one.
	return string[start:end] or None

def extract_id(string):
	"""Return the integer value of the ``NEWID="..."`` attribute in *string*.

	Args:
		string: Text that may contain a ``NEWID="<digits>"`` attribute.

	Returns:
		The run of ASCII digits that directly follows ``NEWID="`` as an int,
		or ``None`` when the attribute is absent or its value does not start
		with a digit.
	"""
	marker = 'NEWID="'
	marker_pos = string.find(marker)
	if marker_pos == -1:
		return None

	start = marker_pos + len(marker)
	end = start
	# Bound the scan by the string length so digits that run to the very end
	# of the text do not index past it.
	while end < len(string) and '0' <= string[end] <= '9':
		end += 1

	if end == start:
		# No digit directly after the opening quote.
		return None
	return int(string[start:end])

def clean_string(string):
	"""Return *string* with every newline character replaced by one space."""
	pieces = string.split('\n')
	return ' '.join(pieces)

def structure_time_date(string):
	"""Parse a timestamp such as ``26-FEB-1987 15:01:01.79`` into a struct_time.

	Surrounding whitespace is stripped and only the first 23 characters are
	parsed, because one day's entry did not report the date tidily and
	carries extra text after the timestamp.

	Raises:
		ValueError: if the trimmed text does not match the expected format.
	"""
	timestamp_format = "%d-%b-%Y %H:%M:%S.%f"
	trimmed = string.strip()
	return strptime(trimmed[:23], timestamp_format)

def get_article_components(raw_string):
	"""Extract the fields of one article into a dict.

	Newlines in *raw_string* are first replaced by spaces. Every name in
	``article_components`` is then looked up as a ``<NAME>...</NAME>`` element.

	Args:
		raw_string: The raw text of a single article.

	Returns:
		A dict with one key per entry of ``article_components`` plus ``'ID'``.
		A field's value is ``None`` when the element is missing or empty, a
		list of strings when the element holds ``<D>...</D>`` entries, and
		the raw enclosed text otherwise. ``'ID'`` holds the result of
		``extract_id`` (an int or ``None``).
	"""
	article_string = clean_string(raw_string)
	article_data = dict.fromkeys(article_components)

	for key in article_components:
		text = extract_text_between_tags(key, article_string)
		# In Reuters-21578 SGML, list-valued fields wrap each entry in
		# <D>...</D>. Both markers must be tested for membership explicitly:
		# `'<D>' and '</D>' in text` would only ever test the second one.
		if text and '<D>' in text and '</D>' in text:
			# Drop the opening markers and turn the closing ones into
			# separators, then split on whitespace into individual entries.
			text = text.replace('<D>', '').replace('</D>', ' ').split()
		article_data[key] = text

	article_data['ID'] = extract_id(article_string)
	return article_data


def save_data(data):
	"""Write the dicts in *data* as rows of ``csv_test.csv``, then print "finished".

	A header row is written first. Each dict becomes one row; keys missing
	from a dict are left empty by ``csv.DictWriter``, while a key that is not
	among the column names makes ``writerow`` raise ``ValueError``.

	Args:
		data: Iterable of dicts keyed by the column names below.
	"""
	columns = [
		'BODY', 'HOUR', 'PLACES', 'PEOPLE', 'YEAR', 'DATELINE', 'TITLE',
		'COMPANIES', 'ID', 'DAY_Y', 'DAY_M', 'EXCHANGES', 'DATE', 'ORGS',
		'MONTH', 'TOPICS',
	]
	# Binary mode is what the Python 2 csv module expects for its output file.
	with open('csv_test.csv', 'wb') as out_file:
		csv_writer = csv.DictWriter(out_file, fieldnames=columns)
		csv_writer.writeheader()
		for row in data:
			csv_writer.writerow(row)
		print("finished")

def get_data():
	data = []
	for file in files:
		open_file = open(file).read().split('')
		for raw_article in open_file:
			if '


你可能感兴趣的:(技术-其他)