def clean_text(text):
    """
    Clean text.

    Applies an ordered series of regular-expression substitutions that
    normalise units, expand contractions, unify a handful of spellings and
    names, pad punctuation with spaces, spell out a few symbols and finally
    collapse runs of whitespace. The rules are applied strictly in the order
    listed because later rules operate on the output of earlier ones (for
    example ``&`` is first padded to `` & `` and then replaced by `` and ``).

    :param text: the string of text
    :return: text string after cleaning
    """
    substitutions = (
        # unit
        (r"(\d+)kgs ", r"\1 kg "),  # e.g. 4kgs => 4 kg
        (r"(\d+)kg ", r"\1 kg "),  # e.g. 4kg => 4 kg
        (r"(\d+)k ", r"\g<1>000 "),  # e.g. 4k => 4000
        (r"\$(\d+)", r"\1 dollar "),
        (r"(\d+)\$", r"\1 dollar "),
        # acronym
        (r"can\'t", "can not"),
        (r"cannot", "can not "),
        (r"what\'s", "what is"),
        (r"What\'s", "what is"),
        (r"\'ve ", " have "),
        (r"n\'t", " not "),
        (r"i\'m", "i am "),
        (r"I\'m", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r"c\+\+", "cplusplus"),
        (r"c \+\+", "cplusplus"),
        (r"c \+ \+", "cplusplus"),
        (r"c#", "csharp"),
        (r"f#", "fsharp"),
        (r"g#", "gsharp"),
        (r" e mail ", " email "),
        (r" e \- mail ", " email "),
        (r" e\-mail ", " email "),
        (r",000", '000'),
        (r"\'s", " "),
        # spelling correction
        (r"ph\.d", "phd"),
        (r"PhD", "phd"),
        (r"pokemons", "pokemon"),
        (r"pokémon", "pokemon"),
        (r"pokemon go ", "pokemon-go "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" 9 11 ", " 911 "),
        (r" j k ", " jk "),
        (r" fb ", " facebook "),
        (r"facebooks", " facebook "),
        (r"facebooking", " facebook "),
        (r"insidefacebook", "inside facebook"),
        (r"donald trump", "trump"),
        (r"the big bang", "big-bang"),
        (r"the european union", "eu"),
        (r" usa ", " america "),
        (r" us ", " america "),
        (r" u s ", " america "),
        (r" U\.S\. ", " america "),
        (r" US ", " america "),
        (r" American ", " america "),
        (r" America ", " america "),
        (r" quaro ", " quora "),
        (r" mbp ", " macbook-pro "),
        (r" mac ", " macbook "),
        (r"macbook pro", "macbook-pro"),
        (r"macbook-pros", "macbook-pro"),
        (r" 1 ", " one "),
        (r" 2 ", " two "),
        (r" 3 ", " three "),
        (r" 4 ", " four "),
        (r" 5 ", " five "),
        (r" 6 ", " six "),
        (r" 7 ", " seven "),
        (r" 8 ", " eight "),
        (r" 9 ", " nine "),
        (r"googling", " google "),
        (r"googled", " google "),
        (r"googleable", " google "),
        (r"googles", " google "),
        (r" rs(\d+)", r" rs \1"),
        (r"(\d+)rs", r" rs \1"),
        (r"dollars", " dollar "),
        # punctuation
        (r"\+", " + "),
        (r"'", " "),
        (r"-", " - "),
        (r"/", " / "),
        # In a replacement template a literal backslash must be written as
        # an escaped backslash; the output is still " \ ".
        (r"\\", r" \\ "),
        (r"=", " = "),
        (r"\^", " ^ "),
        (r":", " : "),
        (r"\.", " . "),
        (r",", " , "),
        (r"\?", " ? "),
        (r"!", " ! "),
        (r"\"", " \" "),
        (r"&", " & "),
        (r"\|", " | "),
        (r";", " ; "),
        (r"\(", " ( "),
        # A closing parenthesis stays a closing parenthesis (it used to be
        # rewritten as an opening one).
        (r"\)", " ) "),
        # symbol replacement
        (r"&", " and "),
        (r"\|", " or "),
        (r"=", " equal "),
        (r"\+", " plus "),
        (r"₹", " rs "),  # test!
        (r"\$", " dollar "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # remove extra space
    return ' '.join(text.split())
import spacy
# Load the small English pipeline, parse one sample sentence and print a
# row of linguistic attributes for every token in it.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
for token in doc:
    row = [
        getattr(token, attribute)
        for attribute in ('text', 'lemma_', 'pos_', 'tag_', 'dep_',
                          'shape_', 'is_alpha', 'is_stop')
    ]
    print(*row)
Text: The original word text.
Lemma: The base form of the word.
POS: The simple part-of-speech tag.
Tag: The detailed part-of-speech tag.
Dep: Syntactic dependency, i.e. the relation between tokens.
Shape: The word shape – capitalisation, punctuation, digits.
is alpha: Is the token an alpha character?
is stop: Is the token part of a stop list, i.e. the most common words of the language?
import json
import string
import nltk
from my_method import noun_chunk
from nltk.corpus import stopwords
from my_method import get_lemma
from my_method import get_tokens
# read data
# Load the two JSON inputs that the processing steps below iterate over.
with open('data/cristic_consensus.json', 'r') as consensus_file:
    consensus = json.loads(consensus_file.read())
with open('data/cristic.json', 'r') as critics_file:
    cristic = json.loads(critics_file.read())
# tokenize (remove stopwords)
# Each text is lemmatised first and the lemmatised text is then tokenised.
# Previously the lemma result was thrown away, get_tokens() was called
# without an argument for the consensus texts, and neither result list was
# ever filled, so empty lists were written to disk.

# `cristic` is iterated as a list of groups, each group a list of texts.
cristic_token = [
    [get_tokens(get_lemma(review)) for review in reviews]
    for reviews in cristic
]
with open('data/cristic_token.json', 'w') as f:
    json.dump(cristic_token, f)

# `consensus` is iterated as a flat list of texts.
consensus_token = [get_tokens(get_lemma(text)) for text in consensus]
with open('data/consensus_token.json', 'w') as f:
    json.dump(consensus_token, f)
# remove stopword
# Lower-case every critic text, strip punctuation characters, tokenise,
# drop English stop words and re-join the remaining words with spaces.

# Build the stop-word set once instead of re-reading the list per token.
english_stopwords = set(stopwords.words('english'))

critics = [[x.lower() for x in c] for c in cristic]
critics_temp = []
for item in critics:
    temp = [''.join(c for c in item_01 if c not in string.punctuation) for item_01 in item]
    # Collect the cleaned group; without this `critics` ended up empty.
    critics_temp.append(temp)
critics = critics_temp
critics = [[nltk.word_tokenize(x) for x in item] for item in critics]
critics = [[' '.join(c for c in item if c not in english_stopwords) for item in item_01] for item_01 in critics]
with open('data/cristic_no_stop.json', 'w') as f:
    # Persist the result (the `with` block previously had no body).
    json.dump(critics, f)
# get noun
# Extract noun chunks from every consensus text and from every critic text,
# and write each result to its own JSON file.
consensus_noun_chunk = [noun_chunk(item) for item in consensus]
with open('data/consensus_noun_chunk.json', 'w') as f:
    json.dump(consensus_noun_chunk, f)

# One list of noun-chunk lists per critic group. Previously the per-group
# result was computed but never stored, so an empty list was written.
cristic_noun_chunk = [
    [noun_chunk(item) for item in reviews]
    for reviews in cristic
]
with open('data/cristic_noun_chunk.json', 'w') as f:
    json.dump(cristic_noun_chunk, f)
import nltk
import string
from nltk.corpus import stopwords
import spacy
import re
from nltk.stem.porter import *
# Module-level spaCy pipeline, loaded once at import time and called by
# get_lemma() and noun_chunk() below.
nlp = spacy.load('en_core_web_lg')
def stem_tokens(tokens, stemmer):
    """
    Stem every token with the given stemmer.

    :param tokens: iterable of token strings
    :param stemmer: object exposing a ``stem(token)`` method
    :return: list with the stemmed form of each token, in input order
    """
    # The loop body was missing, so the stemmer was never applied.
    return [stemmer.stem(item) for item in tokens]
def ie_process(document):
    """
    Split a document into sentences, tokenise each one and POS-tag it.

    :param document: the text to process
    :return: one ``nltk.pos_tag`` result per sentence, in document order
    """
    return [
        nltk.pos_tag(nltk.word_tokenize(sentence))
        for sentence in nltk.sent_tokenize(document)
    ]
def get_tokens(document):
    """
    Turn a text into a list of lower-cased, stop-word-free tokens.

    The text is lower-cased, every character found in ``string.punctuation``
    is removed, the remainder is split with ``nltk.word_tokenize`` and
    English stop words are dropped. Stemming is intentionally not applied.

    :param document: the text to tokenise
    :return: list of remaining tokens, in order of appearance
    """
    document = document.lower()
    document = ''.join(c for c in document if c not in string.punctuation)
    tokens = nltk.word_tokenize(document)
    # Fetch the stop-word list once rather than once per token, and use a
    # set so each membership test is constant-time.
    english_stopwords = set(stopwords.words('english'))
    return [token for token in tokens if token not in english_stopwords]
def get_lemma(document):
    """
    Replace every token of a text by its lemma.

    :param document: the text to lemmatise
    :return: the token lemmas joined by single spaces
    """
    lemmas = [token.lemma_ for token in nlp(document)]
    return ' '.join(lemmas)
def noun_chunk(document):
    """
    Extract the noun chunks of a text.

    :param document: the text to analyse
    :return: list with the text of each noun chunk, in pipeline order
    """
    parsed = nlp(document)
    chunk_texts = []
    for chunk in parsed.noun_chunks:
        chunk_texts.append(chunk.text)
    return chunk_texts
def clean_title(document):
    """
    Split a title on underscores and hyphens.

    :param document: the title string
    :return: list of the pieces between the separators
    """
    separator = re.compile(r'[_-]')
    return separator.split(document)
import json
from textblob import TextBlob
# Run TextBlob over the critic reviews of the first record in
# rottentomatoes.json and collect a sentiment and the noun phrases per review.
sentiment_test = []
noun_phrases_test = []
with open('rottentomatoes.json', 'r') as f:
    data_all = json.load(f)
# Only the first record is used; its '_critics' mapping holds the reviews.
data_test = data_all[0]
data_test = data_test['_critics']
data_test = list(data_test.values())
for review in data_test:
    testimonial = TextBlob(review)
    # Store the results; previously the blob was built and then discarded,
    # leaving both result lists empty.
    sentiment_test.append(testimonial.sentiment)
    noun_phrases_test.append(testimonial.noun_phrases)
In [2]: from textblob import TextBlob
...: testimonial = TextBlob("Textblob is amazingly simple to use. What great
...: fun!")
...: print(testimonial.sentiment)
Sentiment(polarity=0.39166666666666666, subjectivity=0.4357142857142857)
使用TextBlob情感分析的结果,以元组的方式进行返回,形式如(polarity, subjectivity). 其中polarity的分数是一个范围为 [-1.0 , 1.0 ] 浮点数, 正数表示积极,负数表示消极。subjectivity 是一个 范围为 [0.0 , 1.0 ] 的浮点数,其中 0.0 表示 客观,1.0表示主观的。
# Compute, for every record, the tokens each critic text shares with that
# record's consensus text.
with open('data/cristic_token.json') as f:
    cristic = json.load(f)
with open('data/consensus_token.json') as f:
    consensus = json.load(f)

overlap_total = []
# Pair the records up directly instead of indexing with a hard-coded count
# (formerly range(3731)), so the script works for any number of records.
for consensus_tokens, critic_token_lists in zip(consensus, cristic):
    consensus_set = set(consensus_tokens)
    overlap = [list(consensus_set.intersection(item)) for item in critic_token_lists]
    # Keep the per-record result; it used to be overwritten on every pass.
    overlap_total.append(overlap)
import spacy
# Print the pairwise similarity of every token with every token
# (including itself) for a three-word sample.
nlp = spacy.load('en_core_web_md')  # make sure to use larger model!
tokens = nlp(u'dog cat banana')
for left in tokens:
    for right in tokens:
        score = left.similarity(right)
        print(left.text, right.text, score)
dog dog 1.0
dog cat 0.80168545
dog banana 0.24327646
cat dog 0.80168545
cat cat 1.0
cat banana 0.2815437
banana dog 0.24327646
banana cat 0.2815437
banana banana 1.0
# NOTE(review): 'en' is a shortcut name; spaCy v3 no longer resolves
# shortcuts and expects a full package name such as 'en_core_web_sm'.
# Confirm which spaCy version this is meant to run against.
nlp = spacy.load('en')