python处理人民日报语料库

# encoding: utf-8

"""
author: Leo
date: 2017-6-27
"""
from __future__ import unicode_literals  # compatible with python3 unicode

import codecs
import sys
from sys import argv

def character_tagging(input_file, output_file):
	input_data = codecs.open(input_file, 'r', 'utf-8')
	output_data = codecs.open(output_file, 'w', 'utf-8')
	for line in input_data.readlines():
		# 移除字符串的头和尾的空格。strip()方法默认是移除空格的
		word_list = line.strip().split()
		for word in word_list:
			words = word.split("/")
			word = words[0]
			if len(word) == 1:
				output_data.write(word + " "),
			elif len(word) >= 2:
				output_data.write(word[0]),
				for w in word[1: len(word)-1]:
					output_data.write(w),
				output_data.write(word[len(word)-1] + " "),
		output_data.write("\n")
	input_data.close()
	output_data.close()

if __name__ == '__main__':
	if len(sys.argv) != 3:
		print (argv[0])
		sys.exit(-1)
	input_file = sys.argv[1]
	output_file = sys.argv[2]
	character_tagging(input_file, output_file)

你可能感兴趣的:(Python学习)