具体处理方法如下:
处理后的语料下载地址
点击打开链接
from time import strptime
import csv
import numpy
# Names looked up in each article by get_article_components(); every entry
# becomes a key of the per-article dict.
article_components = [
    'DATE',
    'PLACES',
    'DATELINE',
    'TOPICS',
    'PEOPLE',
    'ORGS',
    'EXCHANGES',
    'COMPANIES',
    'TITLE',
    'BODY',
    'HOUR',
]

# Input files 'reut2-000.sgm' through 'reut2-021.sgm'.  A generator expression
# is used (rather than a list comprehension) so that no loop variable is left
# behind in the module namespace on Python 2.
files = list('reut2-%03d.sgm' % number for number in range(22))
def extract_text_between_tags(tag_word, string):
    """Return the text enclosed by ``<tag_word>`` and ``</tag_word>``.

    The returned span starts right after the first opening tag and ends
    right before the last closing tag found in *string*.

    Args:
        tag_word: Tag name without angle brackets, e.g. ``'TITLE'``.
        string: Text to search.

    Returns:
        The enclosed text, or ``None`` when the opening or closing tag is
        missing or when the enclosed text is empty.
    """
    opening_tag = '<' + tag_word + '>'
    # The closing tag must include the '</' prefix; searching for only
    # 'TAG>' matches inside '</TAG>' and leaves a stray '</' on the result.
    closing_tag = '</' + tag_word + '>'
    try:
        start = string.index(opening_tag) + len(opening_tag)
        end = string.rindex(closing_tag)
    except ValueError:
        return None
    # An empty slice (nothing between the tags, or the last closing tag
    # precedes the first opening tag) is reported as None.
    return string[start:end] or None
def extract_id(string):
    """Return the integer that follows ``NEWID="`` in *string*.

    Args:
        string: Text expected to contain a ``NEWID="<digits>`` attribute.

    Returns:
        The parsed integer, or ``None`` when ``NEWID="`` is absent or the
        characters that follow it do not form an integer.
    """
    marker = 'NEWID="'
    try:
        start = string.index(marker) + len(marker)
        end = start + 1
        # Stop at the end of the text as well as at the first non-digit, so
        # input that ends inside the number cannot raise IndexError.
        while end < len(string) and '0' <= string[end] <= '9':
            end += 1
        return int(string[start:end])
    except ValueError:
        # Raised by index() when the marker is missing and by int() when
        # the captured text is not a number.
        return None
def clean_string(string):
    """Return *string* with every newline character turned into a space."""
    pieces = string.split('\n')
    return ' '.join(pieces)
def structure_time_date(string):
    """Parse a ``DD-Mon-YYYY HH:MM:SS.ff`` timestamp into a ``struct_time``.

    Surrounding whitespace is removed and only the first 23 characters of
    what remains are parsed.
    """
    # The slice is needed because the entries of one day did not report the
    # date tidily; anything past the 23rd character is ignored.
    trimmed = string.strip()
    return strptime(trimmed[:23], "%d-%b-%Y %H:%M:%S.%f")
def get_article_components(raw_string):
    """Parse one raw article string into a dict of its components.

    Every name in ``article_components`` becomes a key.  The value is the
    text found between that name's opening and closing tags (after newlines
    have been replaced by spaces), or ``None`` when nothing was found.  An
    extra ``'ID'`` key holds the result of ``extract_id``.

    Values that contain ``<D>...</D>`` items are split into a list of the
    item texts.

    NOTE(review): in the reviewed source the two delimiter literals had been
    reduced to ``''`` and ``' '``, which made the condition always false and
    the splitting branch unreachable.  They are restored here as the
    ``<D>`` / ``</D>`` item delimiters of the Reuters-21578 SGML format --
    confirm against the corpus files.

    Args:
        raw_string: Raw text of a single article.

    Returns:
        dict mapping each component name (plus ``'ID'``) to a string, a list
        of strings, an int (for ``'ID'``) or ``None``.
    """
    article_data = dict.fromkeys(article_components)
    article_string = clean_string(raw_string)
    for key in article_components:
        text = extract_text_between_tags(key, article_string)
        if not text:
            continue
        # Test both delimiters explicitly; "'<D>' and '</D>' in text" would
        # only ever check the second one.
        if '<D>' in text and '</D>' in text:
            text = text.replace('<D>', '').replace('</D>', ' ').split()
        article_data[key] = text
    article_data['ID'] = extract_id(article_string)
    return article_data
def save_data(data):
    """Write the row dicts in *data* to ``csv_test.csv`` with a header row.

    Args:
        data: Iterable of dicts; each one is passed to
            ``csv.DictWriter.writerow`` using the column order below.
    """
    fieldnames = ['BODY', 'HOUR', 'PLACES', 'PEOPLE', 'YEAR', 'DATELINE', 'TITLE', 'COMPANIES', 'ID', 'DAY_Y', 'DAY_M', 'EXCHANGES', 'DATE', 'ORGS', 'MONTH', 'TOPICS']
    # Binary mode is what the Python 2 csv module expects for output files.
    with open('csv_test.csv', 'wb') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        writer.writeheader()
        for record in data:
            writer.writerow(record)
    # Parenthesised form prints the same text under the Python 2 print
    # statement.
    print("finished")
def get_data():
data = []
for file in files:
open_file = open(file).read().split('')
for raw_article in open_file:
if '