import re
import numpy as np
import pandas as pd
import nltk.tokenize as tk
import nltk.corpus as nc
handel_file = 'health_handel.csv'
data=pd.read_excel('health.xlsx')
print(data.head(10))
stopwords = nc.stopwords.words('english')
tokenizer=tk.WordPunctTokenizer()
def preprocess_word(word):
word = word.strip('\'"?!,.():;')
word = re.sub(r'(.)\1+', r'\1\1', word)
word = re.sub(r'(-|\')', '', word)
return word
def is_valid_word(word):
return (re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word) is not None)
def handle_emojis(tweet):
tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' ', tweet)
tweet = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' ', tweet)
tweet = re.sub(r'(<3|:\*)', ' ', tweet)
tweet = re.sub(r'(;-?\)|;-?D|\(-?;)', ' ', tweet)
tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' ', tweet)
tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' ', tweet)
return tweet
def clean_text(tweet):
processed_tweet = []
tweet = tweet.lower()
tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' ', tweet)
tweet = re.sub(r'@[\S]+', 'USER_MENTION', tweet)
tweet = re.sub(r'#(\S+)', ' ', tweet)
tweet = re.sub(r'\brt\b', ' ', tweet)
tweet = re.sub(r'\.{2,}', ' ', tweet)
tweet = tweet.strip(' "\'')
tweet = handle_emojis(tweet)
tweet = re.sub(r'\s+', ' ', tweet)
words = tweet.lower().split()
words = [w for w in words if w not in stopwords]
for word in words:
word = preprocess_word(word)
if is_valid_word(word):
processed_tweet.append(word)
return processed_tweet
data['clean_review']=data.Tweet.apply(clean_text)
data.to_csv(handel_file, index=False)
from nltk.tag import pos_tag
sentences=data.clean_review.tolist()
words=[]
for sent in sentences:
for word in sent:
words.append(word)
word_tag=pos_tag(words)
print(word_tag)
word_dict={}
for item in words:
if item not in word_dict:
word_dict[item]=1
else:
word_dict[item]+=1
key=list(word_dict.keys())
value=list(word_dict.values())
list_words=[]
list_cixing=[]
for k in word_tag:
list_words.append(k[0])
list_cixing.append(k[1])
import xlwt
workbook=xlwt.Workbook()
sheet1=workbook.add_sheet(r'sheet',cell_overwrite_ok=True)
for row in range(row_list):
for column in range(3):
if column==0:
sheet1.write(row,column,list_words[row])
elif column==1:
sheet1.write(row,column,list_cixing[row])
else:
sheet1.write(row,column,value[row])
workbook.save('分词_词性_词频.xls')
unsorted_df=pd.read_excel('分词_词性_词频.xls')
unsorted_df.head(5)
sorted_df=unsorted_df.sort_values(by='count',ascending=False)
sorted_df.head(5)
sorted_df.drop_duplicates(keep='first',inplace=True)
sorted_df
sorted_df.to_excel('分词_词性_词频2.xls',index=False)