NLTK01 《NLTK基础教程--用NLTK和Python库构建机器学习应用》

01 关于NLTK的认知

 很多介绍NLP的,都会提到NLTK库。还以为NLTK是多牛逼的必需品。看了之后,感觉NLTK对实际项目,作用不大。很多内容都是从语义、语法方面解决NLP问题的。感觉不太靠谱。而且本身中文语料库不多。很多介绍NLTK的书籍和blog都比较陈旧。
 《NLTK基础教程--用NLTK和Python库构建机器学习应用》虽然是2017年6月第一版。但内容大部分还是很陈旧的。基本都是采用英文的素材。书中排版类、文字类错误很多。
《Python自然语言处理》([美] Steven Bird, Ewan Klein & Edward Loper 著,陈涛、张旭、崔杨、刘海平 译)介绍得更全面:代码极其陈旧,但知识点很全面。

02 部分代码整理

下面整理了1、2、3、4、6、8章的代码。在win10 nltk3.2.4 python3.5.3/python3.6.1环境下,可以正常运行。一定要注意nltk_data数据的下载,还有缺少库的时候,按需安装。其中 pywin32-221.win-amd64-py3.6.exe/pywin32-221.win-amd64-py3.5.exe 需要手工下载[https://sourceforge.net/projects/pywin32/files/pywin32/Build%20221/]。
需要下载的数据,都在代码里给出链接,或者说明。

02.01 自然语言处理简介(NLTKEssentials01.py)

# 《NLTK基础教程--用NLTK和Python库构建机器学习应用》01 自然语言处理简介
# win10 nltk3.2.4 python3.5.3/python3.6.1
# filename:NLTKEssentials01.py # 自然语言处理简介

import nltk
#nltk.download() # 完全下载需要很久,很可能需要多次尝试,才能下载成功
print("Python and NLTK installed successfully")
'''Python and NLTK installed successfully'''

# 1.2 先从Python开始
# 1.2.1 列表
# Basic list indexing and slicing on a four-element list.
lst = [1, 2, 3, 4]
print(lst)
'''[1, 2, 3, 4]'''

# Concatenating str + int fails, hence the str() conversion below.
# print('Fisrt element: ' + lst[0])
# '''TypeError: must be str, not int'''

print('Fisrt element: ' + str(lst[0]))
'''Fisrt element: 1'''

print('First element: ' + str(lst[0]))
print('last element: ' + str(lst[-1]))
# The slice end is exclusive, so the first three items are lst[0:3]
# (lst[0:2] would yield only two).
print('first three elemenets: ' + str(lst[0:3]))
print('last three elements: ' + str(lst[-3:]))
'''
First element: 1
last element: 4
first three elemenets: [1, 2, 3]
last three elements: [2, 3, 4]
'''

# 1.2.2 自主功能
print(dir(lst))
'''
['__add__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__iadd__', '__imul__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__rmul__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'append', 'clear', 'copy', 'count', 'extend', 'index', 'insert', 'pop', 'remove', 'reverse', 'sort']
'''

print(' , '.join(dir(lst)))
'''
__add__ , __class__ , __contains__ , __delattr__ , __delitem__ , __dir__ , __doc__ , __eq__ , __format__ , __ge__ , __getattribute__ , __getitem__ , __gt__ , __hash__ , __iadd__ , __imul__ , __init__ , __init_subclass__ , __iter__ , __le__ , __len__ , __lt__ , __mul__ , __ne__ , __new__ , __reduce__ , __reduce_ex__ , __repr__ , __reversed__ , __rmul__ , __setattr__ , __setitem__ , __sizeof__ , __str__ , __subclasshook__ , append , clear , copy , count , extend , index , insert , pop , remove , reverse , sort
'''

help(lst.index)
'''
Help on built-in function index:

index(...) method of builtins.list instance
    L.index(value, [start, [stop]]) -> integer -- return first index of value.
    Raises ValueError if the value is not present.
'''

mystring = "Monty Python !  And the holy Grail ! \n"
print(mystring.split())
'''
['Monty', 'Python', '!', 'And', 'the', 'holy', 'Grail', '!']
'''

print(mystring.strip())
'''Monty Python !  And the holy Grail !'''

print(mystring.lstrip())
'''
Monty Python !  And the holy Grail ! 

 '''

print(mystring.rstrip())
'''Monty Python !  And the holy Grail !'''

print(mystring.upper())
'''
MONTY PYTHON !  AND THE HOLY GRAIL ! 

'''
print(mystring.replace('!', ''''''))
'''
Monty Python   And the holy Grail  

'''
# 1.2.3 正则表达式
import re
if re.search('Python', mystring):
    print("We found python ")
else:
    print("No ")
'''We found python '''

import re
print(re.findall('!', mystring))
'''['!', '!']'''

# 1.2.4 字典
# Count how often each whitespace-separated token occurs in mystring.
word_freq = {}
for tok in mystring.split():
    # dict.get supplies 0 the first time a token is seen.
    word_freq[tok] = word_freq.get(tok, 0) + 1
print(word_freq)
'''{'Monty': 1, 'Python': 1, '!': 2, 'And': 1, 'the': 1, 'holy': 1, 'Grail': 1}'''

# 1.2.5 编写函数
import sys
def wordfreq(mystring):
    '''
    Print a text and the frequency of each whitespace-separated token in it.

    The text is echoed first, then the token -> count dict is printed.
    The same dict is returned so callers can use the distribution instead
    of only seeing it on stdout.

    mystring: the text to analyse; it is split on whitespace.
    Returns: dict mapping each token to its number of occurrences
             (empty dict for empty or whitespace-only text).
    '''
    print(mystring)
    word_freq = {}
    for tok in mystring.split():
        # dict.get supplies 0 the first time a token is seen.
        word_freq[tok] = word_freq.get(tok, 0) + 1
    print(word_freq)
    return word_freq

def main():
    '''Run wordfreq on a fixed sample sentence.'''
    # Named `text` rather than `str` so the builtin str type is not shadowed
    # inside this function.
    text = "This is my fist python program"
    wordfreq(text)


if __name__ == '__main__':
    main()
'''
This is my fist python program
{'This': 1, 'is': 1, 'my': 1, 'fist': 1, 'python': 1, 'program': 1}
'''

# 1.3 向NLTK迈进
from urllib import request
# The with-block closes the HTTP response once the body has been read,
# instead of leaving the connection open for the rest of the script.
with request.urlopen('http://python.org/') as response:
    html = response.read()
# The body arrives as bytes; decode it to text before tokenizing.
html = html.decode('utf-8')
print(len(html))
'''48141'''
#print(html)

tokens = [tok for tok in html.split()]
print("Total no of tokens :" + str(len(tokens)))
'''Total no of tokens :2901'''
print(tokens[0: 100])
'''
['', 'html>', '', '', '', '', 'class="no-js"', 'lang="en"', 'dir="ltr">', '', '', '', 'charset="utf-8">', '', 'http-equiv="X-UA-Compatible"', 'content="IE=edge">', '<link', 'rel="prefetch"', 'href="//ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js">', '', 'name="application-name"', 'content="Python.org">', '', 'name="msapplication-tooltip"', 'content="The', 'official', 'home', 'of', 'the', 'Python', 'Programming', 'Language">', '', 'name="apple-mobile-web-app-title"', 'content="Python.org">', '', 'name="apple-mobile-web-app-capable"', 'content="yes">', '', 'name="apple-mobile-web-app-status-bar-style"', 'content="black">', '', 'name="viewport"', 'content="width=device-width,', 'initial-scale=1.0">', '', 'name="HandheldFriendly"', 'content="True">', '', 'name="format-detection"', 'content="telephone=no">', '', 'http-equiv="cleartype"', 'content="on">', '', 'http-equiv="imagetoolbar"', 'content="false">', '', 'src="/static/js/libs/modernizr.js"></script>', '/stylesheets/style.css"', 'rel="stylesheet"', 'type="text/css"', 'title="default"', '/>', '"', 'rel="stylesheet"', 'type="text/css"', 'media="not', 'print,', 'braille,']
'''

import re
# Split on runs of non-word characters. The pattern is a raw string because
# '\W' in a plain string literal is an invalid escape sequence, which
# Python reports with a warning.
tokens = re.split(r'\W+', html)
print(len(tokens))
'''6131'''
print(tokens[0: 100])
'''
['', 'doctype', 'html', 'if', 'lt', 'IE', '7', 'html', 'class', 'no', 'js', 'ie6', 'lt', 'ie7', 'lt', 'ie8', 'lt', 'ie9', 'endif', 'if', 'IE', '7', 'html', 'class', 'no', 'js', 'ie7', 'lt', 'ie8', 'lt', 'ie9', 'endif', 'if', 'IE', '8', 'html', 'class', 'no', 'js', 'ie8', 'lt', 'ie9', 'endif', 'if', 'gt', 'IE', '8', 'html', 'class', 'no', 'js', 'lang', 'en', 'dir', 'ltr', 'endif', 'head', 'meta', 'charset', 'utf', '8', 'meta', 'http', 'equiv', 'X', 'UA', 'Compatible', 'content', 'IE', 'edge', 'link', 'rel', 'prefetch', 'href', 'ajax', 'googleapis', 'com', 'ajax', 'libs', 'jquery', '1', '8', '2', 'jquery', 'min', 'js', 'meta', 'name', 'application', 'name', 'content', 'Python', 'org', 'meta', 'name', 'msapplication', 'tooltip', 'content', 'The', 'official']
'''

'''pip3 install bs4 lxml'''
import nltk
from bs4 import BeautifulSoup
#clean = nltk.clean_html(html)
#tokens = [tok for tok in clean.split()]
soup = BeautifulSoup(html, "lxml")
clean = soup.get_text()
tokens = [tok for tok in clean.split()]
print(tokens[:100])
'''
['Welcome', 'to', 'Python.org', '{', '"@context":', '"http://schema.org",', '"@type":', '"WebSite",', '"url":', '"https://www.python.org/",', '"potentialAction":', '{', '"@type":', '"SearchAction",', '"target":', '"https://www.python.org/search/?q={search_term_string}",', '"query-input":', '"required', 'name=search_term_string"', '}', '}', 'var', '_gaq', '=', '_gaq', '||', '[];', "_gaq.push(['_setAccount',", "'UA-39055973-1']);", "_gaq.push(['_trackPageview']);", '(function()', '{', 'var', 'ga', '=', "document.createElement('script');", 'ga.type', '=', "'text/javascript';", 'ga.async', '=', 'true;', 'ga.src', '=', "('https:'", '==', 'document.location.protocol', '?', "'https://ssl'", ':', "'http://www')", '+', "'.google-analytics.com/ga.js';", 'var', 's', '=', "document.getElementsByTagName('script')[0];", 's.parentNode.insertBefore(ga,', 's);', '})();', 'Notice:', 'While', 'Javascript', 'is', 'not', 'essential', 'for', 'this', 'website,', 'your', 'interaction', 'with', 'the', 'content', 'will', 'be', 'limited.', 'Please', 'turn', 'Javascript', 'on', 'for', 'the', 'full', 'experience.', 'Skip', 'to', 'content', '', 'Close', 'Python', 'PSF', 'Docs', 'PyPI', 'Jobs', 'Community', '', 'The', 'Python', 'Network']
'''

import operator
# Tally token occurrences, then list the 25 most frequent tokens.
freq_dis = {}
for tok in tokens:
    freq_dis[tok] = freq_dis.get(tok, 0) + 1
# Each item is (token, count); sort on the count, highest first.
sorted_freq_dist = sorted(
    freq_dis.items(),
    key = operator.itemgetter(1),
    reverse = True,
)
print(sorted_freq_dist[:25])
'''
[('Python', 60), ('>>>', 24), ('and', 22), ('is', 18), ('the', 18), ('to', 17), ('of', 15), ('=', 14), ('Events', 11), ('News', 11), ('a', 10), ('for', 10), ('More', 9), ('#', 9), ('3', 8), ('in', 8), ('Community', 7), ('with', 7), ('...', 7), ('Docs', 6), ('Guide', 6), ('Software', 6), ('now', 5), ('that', 5), ('The', 5)]
'''

import nltk
Freq_dist_nltk = nltk.FreqDist(tokens)
print(Freq_dist_nltk)
'''<FreqDist with 600 samples and 1105 outcomes>'''
for k, v in Freq_dist_nltk.items():
    print(str(k) + ':' + str(v))
'''
This:1
[fruit.upper():1
Forums:2
Check:1
...
GUI:1
Intuitive:1
X:2
growth:1
advance:1
'''

# below is the plot for the frequency distributions
# 显示词频分布(cumulative=False,即非累积)
Freq_dist_nltk.plot(50, cumulative=False)

## 停用词处理
#stopwords = [word.strip().lower() for word in open("PATH/english.stop.txt")]
#clean_tokens=[tok for tok in tokens if len(tok.lower()) > 1 and (tok.lower() not in stopwords)]
#Freq_dist_nltk = nltk.FreqDist(clean_tokens)
#Freq_dist_nltk.plot(50, cumulative = False)

02.02 文本的歧义及其清理(NLTKEssentials02.py)

# 《NLTK基础教程--用NLTK和Python库构建机器学习应用》02 文本的歧义及其清理
# win10 nltk3.2.4 python3.5.3/python3.6.1
# filename:NLTKEssentials02.py # 文本的歧义及其清理

# 标识化处理、词干提取、词形还原(lemmatization)、停用词移除

# 2.1 文本歧义
'''
# examples.csv
"test01",99
"test02",999
"test03",998
"test04",997
"test05",996
'''
import csv
with open('examples.csv', 'r', encoding='utf-8') as f:
    reader = csv.reader(f, delimiter = ',', quotechar = '"')
    for line in reader:
        print(line[1])
'''
99
999
998
997
996
'''

'''
# examples.json
{
  "array": [1, 2, 3, 4],
  "boolean": true,
  "object": {"a": "b"},
  "string": "Hello, World"
}
'''
import json
# Variant 1: manual open/close. try/finally guarantees the handle is
# released even if json.load raises.
jsonfile = open('examples.json', 'r', encoding='utf-8')
try:
    data = json.load(jsonfile)
finally:
    jsonfile.close()
print(data['string'])
'''Hello, World'''
# Variant 2 (preferred): the with-statement closes the file automatically.
with open('examples.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
    print(data['string'])
'''Hello, World'''

# 2.2 文本清理

# 2.3 语句分离
import nltk
inputstring = 'This is an examples sent. The sentence splitter will split on sent markers. Ohh really !!'
from nltk.tokenize import sent_tokenize
#all_sent = sent_tokenize(inputstring, language="english")
all_sent = sent_tokenize(inputstring)
print(all_sent)

import nltk.tokenize.punkt
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()


# 2.4分词(标识化处理) tokenize http://text-processing.com/demo
s = "Hi Everyone ! hola gr8"
print(s.split())
from nltk.tokenize import word_tokenize
word_tokenize(s)

from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize
# Patterns are raw strings: '\w' and '\d' in plain string literals are
# invalid escape sequences.
regexp_tokenize(s, pattern = r'\w+')   # runs of word characters
regexp_tokenize(s, pattern = r'\d+')   # runs of digits
wordpunct_tokenize(s)
blankline_tokenize(s)


# 2.5 词干提取(stemming)
# eat eating eaten eats ==> eat
# 对于中文、日文,词干提取很难实现
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
pst = PorterStemmer()
lst = LancasterStemmer()
print(lst.stem("eating"))
'''eat'''
print(pst.stem("shopping"))
'''shop'''


# 2.6 词形还原(lemmatization),词根(lemma)
from nltk.stem import WordNetLemmatizer
wlem = WordNetLemmatizer()
wlem.lemmatize("ate")
# Resource 'corpora/wordnet.zip/wordnet/' not found.  Please use the NLTK Downloader to obtain the resource:  >>> nltk.download()


# 2.7 停用词移除(Stop word removal)
import nltk
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
text = "This is just a test"
cleanwordlist = [word for word in text.split() if word not in stoplist]
print(cleanwordlist)
'''['This', 'test']'''


# 2.8 罕见词移除
'''
import nltk
token = text.split()
freq_dist = nltk.FreqDist(token)
# FreqDist.keys() is a view in Python 3 and cannot be sliced; take the
# 50 least frequent words from the tail of most_common() instead.
rarewords = set(word for word, count in freq_dist.most_common()[-50:])
after_rare_words = [word for word in token if word not in rarewords]
print(after_rare_words)
'''

# 2.9 拼写纠错(spellchecker)
from nltk.metrics import edit_distance
print(edit_distance("rain", "shine")) # 3

02.03 词性标注(NLTKEssentials03.py)

# 《NLTK基础教程--用NLTK和Python库构建机器学习应用》03 词性标注
# win10 nltk3.2.4 python3.5.3/python3.6.1
# filename:NLTKEssentials03.py # 词性标注

# 3.1 词性标注
# 词性(POS)
# PennTreebank

import nltk
from nltk import word_tokenize
s = "I was watching TV"
print(nltk.pos_tag(word_tokenize(s)))


tagged = nltk.pos_tag(word_tokenize(s))
allnoun = [word for word, pos in tagged if pos in ['NN', 'NNP']]
print(allnoun)


# 3.1.1 Stanford标注器
# https://nlp.stanford.edu/software/stanford-postagger-full-2017-06-09.zip
from nltk.tag.stanford import StanfordPOSTagger
import nltk
stan_tagger = StanfordPOSTagger('D:/nltk_data/stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger',
                                'D:/nltk_data/stanford-postagger-full-2017-06-09/stanford-postagger.jar')
s = "I was watching TV"
tokens = nltk.word_tokenize(s)
stan_tagger.tag(tokens)

# 3.1.2 深入了解标注器
from nltk.corpus import brown
import nltk
tags = [tag for (word, tag) in brown.tagged_words(categories = 'news')]
print(nltk.FreqDist(tags))

brown_tagged_sents = brown.tagged_sents(categories = 'news')
default_tagger = nltk.DefaultTagger('NN')
print(default_tagger.evaluate(brown_tagged_sents))


# 3.1.3 顺序性标注器
# 1 N-Gram标注器
from nltk.tag import UnigramTagger
from nltk.tag import DefaultTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger
train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]
unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
print(unigram_tagger.evaluate(test_data))
biggram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print(biggram_tagger.evaluate(test_data))
trigram_tagger = TrigramTagger(train_data, backoff=biggram_tagger)
print(trigram_tagger.evaluate(test_data))

# 2 正则表达式标注器
from nltk.tag.sequential import RegexpTagger
# Patterns are listed most specific first, ending with the catch-all
# noun rule.
regexp_tagger = RegexpTagger(
    # The decimal point is escaped: an unescaped '.' matches any character,
    # so tokens such as '1a5' would be tagged as numbers.
    [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
     (r'(The|the|A|a|An|an)$', 'AT'),  # articles
     (r'.*able$', 'JJ'), # adjectives
     (r'.*ness$', 'NN'), # nouns formed from adj
     (r'.*ly$', 'RB'),   # adverbs
     (r'.*s$', 'NNS'),   # plural nouns
     (r'.*ing$', 'VBG'), # gerunds
     (r'.*ed$', 'VBD'),  # past tense verbs
     (r'.*', 'NN')       # nouns (default)
    ])
print(regexp_tagger.evaluate(test_data))

# 3.1.4 Brill 标注器
# 3.1.5 基于机器学习的标注器
# 最大熵分类器(MEC)
# 隐性马尔科夫模型(HMM)
# 条件随机场(CRF)

# 3.2 命名实体识别(NER)
# NER标注器
import nltk
from nltk import ne_chunk
sent = "Mark is studying at Stanford University in California"
print(ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)), binary = False))

from nltk.tag.stanford import StanfordNERTagger
# https://nlp.stanford.edu/software/stanford-ner-2017-06-09.zip
st = StanfordNERTagger('D:/nltk_data/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'D:/nltk_data/stanford-ner-2017-06-09/stanford-ner.jar')
st.tag('Rami Eid is studying at Stony Brook University in NY'.split())

02.04 文本结构解析(NLTKEssentials04.py)

# 《NLTK基础教程--用NLTK和Python库构建机器学习应用》04 文本结构解析
# win10 nltk3.2.4 python3.5.3/python3.6.1
# filename:NLTKEssentials04.py # 文本结构解析

# 4.1 浅解析与深解析
# CFG(context-free grammar):上下文无关语法
# PCFG(probabilistic context-free grammar):概率性上下文无关语法
# 浅解析:shallow parsing
# 深解析:deep parsing

# 4.2 两种解析方法
# 基于规则
# 基于概率

# 4.3 为什么需要进行解析
# 语法解析器(syntactic parser)
'''
import nltk
from nltk import CFG
toy_grammar = nltk.CFG.fromstring(
"""
S -> NP VP  # S indicate the entire sentence
VP -> V NP  # VP is verb phrase the
V -> "eats" | "drinks" # V is verb
NP -> Det N # NP is noun phrase (chunk that has noun in it)
Det -> "a" | "an" | "the" # Det is determiner used in the sentences
N -> "president" | "Obama" | "apple" | "coke" # N some example nouns
""")
toy_grammar.productions()
'''

# 4.4 不同的解析器类型
# 4.4.1 递归下降解析器
# 4.4.2 移位-规约解析器
# 4.4.3 图表解析器
# 4.4.4 正则表达式解析器
import nltk
from nltk.chunk.regexp import *
chunk_rules = ChunkRule("<.*>+", "chunk everything")
# Chunk grammar: each rule names a phrase type and, in braces, the tag
# pattern that forms it.
# NOTE(review): the <TAG> patterns had been stripped from this listing
# (together with its line breaks); they are restored here from the standard
# NLTK chunk-grammar example -- confirm against the book's listing.
reg_parser = RegexpParser('''
NP: {<DT>? <JJ>* <NN>*}  # NP
P: {<IN>}                # Preposition
V: {<V.*>}               # Verb
PP: {<P> <NP>}           # PP -> P NP
VP: {<V> <NP|PP>*}       # VP -> V (NP|PP)*
''')
test_sent = "Mr. Obama played a big role in the Health insurance bill"
test_sent_pos = nltk.pos_tag(nltk.word_tokenize(test_sent))
paresed_out = reg_parser.parse(test_sent_pos)
print(paresed_out)

# 4.5 Dependency parsing (DP)
# Probabilistic, projective dependency parser
from nltk.parse.stanford import StanfordParser
# https://nlp.stanford.edu/software/stanford-parser-full-2017-06-09.zip
english_parser = StanfordParser('D:/nltk_data/stanford-parser-full-2017-06-09/stanford-parser.jar',
                                'D:/nltk_data/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar')
# raw_parse_sents takes a sequence of sentences. A bare string in
# parentheses is still a string, so it would be consumed character by
# character; pass a one-element list instead.
english_parser.raw_parse_sents(["this is the english parser test"])

# 4.6 Chunking
'''
from nltk.chunk.regexp import *
test_sent = "The prime minister announced he had asked the chief government whip, Philip Ruddock, to call a special party room meeting for 9am on Monday to consider the spill motion."
test_sent_pos = nltk.pos_tag(nltk.word_tokenize(test_sent))
rule_vp = ChunkRule(r'(<VB.*>)?(<VB.*>)+(<PRP>)?', 'Chunk VPs')
parser_vp = RegexpChunkParser([rule_vp], chunk_label = 'VP')
print(parser_vp.parse(test_sent_pos))
rule_np = ChunkRule(r'(<DT>?<RB>?)?<JJ|CD>*(<JJ|CD><,>)*(<NN.*>)+', 'Chunk NPs')
parser_np = RegexpChunkParser([rule_np], chunk_label="NP")
print(parser_np.parse(test_sent_pos))
'''

# 4.7 Information extraction
# 4.7.1 Named entity recognition (NER)
# absolute path for the file of text for which we want NER
with open("D:/nltk_data/ner_sample.txt") as f:
    text = f.read()
sentences = nltk.sent_tokenize(text)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
for sent in tagged_sentences:
    print(nltk.ne_chunk(sent))

# 4.7.2 Relation extraction
import re
# Matches text containing the word "in" that is not followed by a
# gerund-like "...ing" word.
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus = 'ieer', pattern = IN):
        print(nltk.sem.rtuple(rel))

02.06 文本分类(NLTKEssentials06.py)

# 《NLTK基础教程--用NLTK和Python库构建机器学习应用》06 文本分类
# win10 nltk3.2.4 python3.5.3/python3.6.1
# filename:NLTKEssentials06.py # 文本分类

# 6.2 文本分类
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import csv
def preprocessing(text):
    '''
    Normalise a raw text into a space-joined string of cleaned tokens.

    Steps, in order: sentence + word tokenization, lowercasing, English
    stopword removal, dropping tokens shorter than three characters, and
    WordNet lemmatization.

    text: the raw text (str).
    Returns: the cleaned tokens joined by single spaces (str).
    '''
    #text = text.decode("utf8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # Lowercase BEFORE filtering stopwords: the membership test is
    # case-sensitive, so capitalised stopwords such as "This" would
    # otherwise slip through.
    tokens = [word.lower() for word in tokens]
    # remove stopwords (a set gives constant-time membership tests)
    stop = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop]
    # remove words less than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
# Load the SMS corpus: one message per line, "<label>\t<text>".
# check the structure of this file!
sms_data = []
sms_labels = []
# The with-block closes the file even if preprocessing raises part-way.
with open('D:/nltk_data/SMSSpamCollection', encoding='utf8', newline='') as sms:
    # QUOTE_NONE: the messages are free text, so a '"' inside a message must
    # be read literally rather than as the start of a quoted CSV field.
    csv_reader = csv.reader(sms, delimiter = '\t', quoting = csv.QUOTE_NONE)
    for line in csv_reader:
        # Skip blank or malformed rows that lack a text column.
        if len(line) < 2:
            continue
        # adding the sms_id
        sms_labels.append(line[0])
        # adding the cleaned text We are calling preprocessing method
        sms_data.append(preprocessing(line[1]))

# 6.3 采样操作
import sklearn
import numpy as np
trainset_size = int(round(len(sms_data)*0.70))
# i chose this threshold for 70:30 train and test split.
print('The training set size for this classifier is ' + str(trainset_size) + '\n')
x_train = np.array(sms_data[:trainset_size])
y_train = np.array(sms_labels[:trainset_size])
# The test split starts exactly where the training split ends; starting at
# trainset_size + 1 would silently drop one sample from both splits.
x_test = np.array(sms_data[trainset_size:])
y_test = np.array(sms_labels[trainset_size:])

print(x_train)
print(y_train)

from sklearn.feature_extraction.text import CountVectorizer
sms_exp = []
for line in sms_data:
    sms_exp.append(preprocessing(line))
vectorizer = CountVectorizer(min_df = 1, encoding='utf-8')
X_exp = vectorizer.fit_transform(sms_exp)
print("||".join(vectorizer.get_feature_names()))
print(X_exp.toarray())

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df = 2, ngram_range=(1, 2),
                             stop_words = 'english', strip_accents = 'unicode', norm = 'l2')
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)

# 6.3.1 朴素贝叶斯法
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
clf = MultinomialNB().fit(X_train, y_train)
y_nb_predicted = clf.predict(X_test)
print(y_nb_predicted)
print('\n confusion_matrix \n')
#cm = confusion_matrix(y_test, y_pred)
cm = confusion_matrix(y_test, y_nb_predicted)
print(cm)
print('\n Here is the classification report:')
print(classification_report(y_test, y_nb_predicted))

feature_names = vectorizer.get_feature_names()
coefs = clf.coef_
intercept = clf.intercept_
coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
n = 10
top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
for (coef_1, fn_1), (coef_2, fn_2) in top:
    print('\t%.4f\t%-15s\t\t%.4f\t%-15s' %(coef_1, fn_1, coef_2, fn_2))

# 6.3.2 决策树
from sklearn import tree
clf = tree.DecisionTreeClassifier().fit(X_train.toarray(), y_train)
y_tree_predicted = clf.predict(X_test.toarray())
print(y_tree_predicted)
print('\n Here is the classification report:')
print(classification_report(y_test, y_tree_predicted))

# 6.3.3 随机梯度下降法
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
clf = SGDClassifier(alpha = 0.0001, n_iter=50).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('\n Here is the classification report:')
print(classification_report(y_test, y_pred))
print(' \n confusion_matrix \n')
cm = confusion_matrix(y_test, y_pred)
print(cm)

# 6.3.4 逻辑回归
# 6.3.5 支持向量机
from sklearn.svm import LinearSVC
svm_classifier = LinearSVC().fit(X_train, y_train)
y_svm_predicted = svm_classifier.predict(X_test)
print('\n Here is the classification report:')
print(classification_report(y_test, y_svm_predicted))
# Score the SVM's own predictions; y_pred still holds the SGD classifier's
# output from the previous section.
cm = confusion_matrix(y_test, y_svm_predicted)
print(cm)

# 6.4 随机森林
from sklearn.ensemble import RandomForestClassifier
RF_clf = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)
predicted = RF_clf.predict(X_test)
print('\n Here is the classification report:')
print(classification_report(y_test, predicted))
# Score the random forest's own predictions; y_pred still holds the SGD
# classifier's output from an earlier section.
cm = confusion_matrix(y_test, predicted)
print(cm)

# 6.5 文本聚类
# K 均值法
from sklearn.cluster import KMeans, MiniBatchKMeans
from collections import defaultdict
true_k = 5
km = KMeans(n_clusters = true_k, init='k-means++', max_iter=100, n_init= 1)
kmini = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=2)
km_model = km.fit(X_train)
kmini_model = kmini.fit(X_train)
print("For K-mean clustering ")
clustering = defaultdict(list)
for idx, label in enumerate(km_model.labels_):
    clustering[label].append(idx)
print("For K-mean Mini batch clustering ")
clustering = defaultdict(list)
for idx, label in enumerate(kmini_model.labels_):
    clustering[label].append(idx)

# 6.6 文本中的主题建模
# https://pypi.python.org/pypi/gensim#downloads
import gensim
from gensim import corpora, models, similarities
from itertools import chain
import nltk
from nltk.corpus import stopwords
from operator import itemgetter
import re
documents = [document for document in sms_data]
stoplist = stopwords.words('english')
texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]
print(texts)


dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word = dictionary, num_topics = 100)
# print(lsi.print_topics(20))
n_topics = 5
lda = models.LdaModel(corpus_tfidf, id2word = dictionary, num_topics = n_topics)
for i in range(0, n_topics):
    temp = lda.show_topic(i, 10)
    terms = []
    for term in temp:
        terms.append(str(term[0]))
    print("Top 10 terms for topic #" + str(i) + ": " + ",".join(terms))

02.08 NLTK与其他Python库的搭配运用(NLTKEssentials08.py)

# 《NLTK基础教程--用NLTK和Python库构建机器学习应用》08 NLTK与其他Python库的搭配使用
# win10 nltk3.2.4 python3.5.3/python3.6.1
# filename:NLTKEssentials08.py # NLTK与其他Python库的搭配使用

# 8.1 numpy
# 8.1.1 多维数组
import numpy as np
x = [1, 2, 5, 7, 3, 11, 14, 25]
np_arr = np.array(x)
print(np_arr)
'''[ 1  2  5  7  3 11 14 25]'''

arr = [[1, 2], [13, 4], [33, 78]]
np_2darr = np.array(arr)
print(type(np_2darr))
'''<class 'numpy.ndarray'>'''

# 索引操作
print(np_2darr.tolist())
print(np_2darr[:])
'''
[[ 1  2]
 [13  4]
 [33 78]]
'''
print(np_2darr[:2])
'''
[[ 1  2]
 [13  4]]
 '''
print(np_2darr[:1])
'''[[1 2]]'''
print(np_2darr[2])
'''[33 78]'''
print(np_2darr[2][0])
'''33'''
print(np_2darr[:-1])
'''
[[ 1  2]
 [13  4]]
'''

# 8.1.2 基本运算
# import numpy as np
print(np.arange(0.0, 1.0, 0.1))
'''[ 0.   0.1  0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9]'''
print(np.ones([2, 4]))
'''
[[ 1.  1.  1.  1.]
 [ 1.  1.  1.  1.]]
 '''
print(np.zeros([3, 4]))
'''
[[ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]
 '''
print(np.linspace(0, 2, 10))
'''
[ 0.          0.22222222  0.44444444  0.66666667  0.88888889  1.11111111
  1.33333333  1.55555556  1.77777778  2.        ]
'''
print(np.logspace(0, 1))
'''
[  1.           1.04811313   1.09854114   1.1513954    1.20679264
   1.26485522   1.32571137   1.38949549   1.45634848   1.52641797
   1.59985872   1.67683294   1.75751062   1.84206997   1.93069773
   2.02358965   2.12095089   2.22299648   2.32995181   2.44205309
   2.55954792   2.6826958    2.8117687    2.9470517    3.0888436
   3.23745754   3.39322177   3.55648031   3.72759372   3.90693994
   4.09491506   4.29193426   4.49843267   4.71486636   4.94171336
   5.17947468   5.42867544   5.68986603   5.96362332   6.25055193
   6.55128557   6.86648845   7.19685673   7.54312006   7.90604321
   8.28642773   8.68511374   9.10298178   9.54095476  10.        ]
'''
help(np.logspace)
'''
Help on function logspace in module numpy.core.function_base:

logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None)
    Return numbers spaced evenly on a log scale.

    In linear space, the sequence starts at ``base ** start``
    (`base` to the power of `start`) and ends with ``base ** stop``
    (see `endpoint` below).

    Parameters
    ----------
    start : float
        ``base ** start`` is the starting value of the sequence.
    stop : float
        ``base ** stop`` is the final value of the sequence, unless `endpoint`
        is False.  In that case, ``num + 1`` values are spaced over the
        interval in log-space, of which all but the last (a sequence of
        length `num`) are returned.
    num : integer, optional
        Number of samples to generate.  Default is 50.
    endpoint : boolean, optional
        If true, `stop` is the last sample. Otherwise, it is not included.
        Default is True.
    base : float, optional
        The base of the log space. The step size between the elements in
        ``ln(samples) / ln(base)`` (or ``log_base(samples)``) is uniform.
        Default is 10.0.
    dtype : dtype
        The type of the output array.  If `dtype` is not given, infer the data
        type from the other input arguments.

    Returns
    -------
    samples : ndarray
        `num` samples, equally spaced on a log scale.

    See Also
    --------
    arange : Similar to linspace, with the step size specified instead of the
             number of samples. Note that, when used with a float endpoint, the
             endpoint may or may not be included.
    linspace : Similar to logspace, but with the samples uniformly distributed
               in linear space, instead of log space.
    geomspace : Similar to logspace, but with endpoints specified directly.

    Notes
    -----
    Logspace is equivalent to the code

    >>> y = np.linspace(start, stop, num=num, endpoint=endpoint)
    ... # doctest: +SKIP
    >>> power(base, y).astype(dtype)
    ... # doctest: +SKIP

    Examples
    --------
    >>> np.logspace(2.0, 3.0, num=4)
    array([  100.        ,   215.443469  ,   464.15888336,  1000.        ])
    >>> np.logspace(2.0, 3.0, num=4, endpoint=False)
    array([ 100.        ,  177.827941  ,  316.22776602,  562.34132519])
    >>> np.logspace(2.0, 3.0, num=4, base=2.0)
    array([ 4.        ,  5.0396842 ,  6.34960421,  8.        ])

    Graphical illustration:

    >>> import matplotlib.pyplot as plt
    >>> N = 10
    >>> x1 = np.logspace(0.1, 1, N, endpoint=True)
    >>> x2 = np.logspace(0.1, 1, N, endpoint=False)
    >>> y = np.zeros(N)
    >>> plt.plot(x1, y, 'o')
    []
    >>> plt.plot(x2, y + 0.5, 'o')
    []
    >>> plt.ylim([-0.5, 1])
    (-0.5, 1)
    >>> plt.show()
'''
# 8.1.3 从数组中提取数据
A = np.array([[0, 0, 0], [0, 1, 2], [0, 2, 4], [0, 3, 6]])
B = np.array([n for n in range(4)])
print(B)
'''[0 1 2 3]'''
less_than_3 = B < 3
print(less_than_3)
'''[ True  True  True False]'''
print(B[less_than_3])
'''[0 1 2]'''
B[less_than_3] = 0
print(B)
'''[0 0 0 3]'''
print(np.diag(A))
'''[0 1 4]'''

# 8.1.4 复杂矩阵运算
A = np.array([[1, 2], [3, 4]])
print(A * A) # element-wise product
'''
[[ 1  4]
 [ 9 16]]
'''
print(np.dot(A, A)) # dot (matrix) product
'''
[[ 7 10]
 [15 22]]
'''
print(A - A)
'''
[[0 0]
 [0 0]]
'''
print(A + A)
'''
[[2 4]
 [6 8]]
'''
print(np.transpose(A))
'''
[[1 3]
 [2 4]]
'''
# axes=[0, 1] keeps the axis order, so the array comes back unchanged.
print(np.transpose(A, axes = [0, 1]))
'''
[[1 2]
 [3 4]]
'''
print(A.T)
'''
[[1 3]
 [2 4]]
'''
M = np.matrix(A)
print(M)
'''
[[1 2]
 [3 4]]
'''
# np.invert is the element-wise bitwise NOT (~x), not the matrix inverse.
print(np.invert(M))
'''
[[-2 -3]
 [-4 -5]]
'''
# The matrix inverse proper comes from np.linalg.inv.
M_inv = np.linalg.inv(M)
print(M_inv)
'''
[[-2.   1. ]
 [ 1.5 -0.5]]
'''
N = np.random.randn(1, 10)
print(N)
'''
[[-0.08839128  1.25979204 -0.46311213 -0.27113081  0.85757258 -1.28109429
  -1.00875299  0.10666042 -0.49751293  0.81362605]]
'''
# 1 重塑和堆叠
print(A)
'''
[[1 2]
 [3 4]]
'''
(r, c) = A.shape
print(r, c)
'''2 2'''
print(A.reshape((1, r * c)))
'''[[1 2 3 4]]'''
print(np.repeat(A, 2))
'''[1 1 2 2 3 3 4 4]'''
print(A)
'''
[[1 2]
 [3 4]]
'''
print(np.tile(A, 4))
'''
[[1 2 1 2 1 2 1 2]
 [3 4 3 4 3 4 3 4]]
'''
B = np.array([[5, 6]])
print(np.concatenate((A, B), axis = 0))
'''
[[1 2]
 [3 4]
 [5 6]]
'''
print(np.vstack((A, B)))
'''
[[1 2]
 [3 4]
 [5 6]]
'''
print(np.concatenate((A, B.T), axis = 1))
'''
[[1 2 5]
 [3 4 6]]
'''
# 2 随机数
from numpy import random
# uniform random number from [0, 1]
print(random.rand(2, 5))
'''
[[ 0.15398327  0.88990373  0.99180579  0.89229317  0.40380238]
 [ 0.10244161  0.16451004  0.45110841  0.3621777   0.4680435 ]]
'''
print(random.randn(2, 5))
'''
[[ 0.87847643 -0.87712286  0.75692718  1.43164752  0.26695439]
 [ 1.45320364 -0.7812028  -0.17562589  1.72513472  1.35974398]]
'''

# 8.2 SciPy
import scipy as sp
from scipy.integrate import quad, dblquad, tplquad
def f(x):
    return x
x_lower = 0 # the lower limit of x
x_upper = 1 # the upper limit of x
val, abserr = quad(f, x_lower, x_upper)
print(val, abserr)
'''0.5 5.551115123125783e-15'''
# 插值运算 scipy.interpolate
# 傅里叶变换 scipy.fftpack
# 信号处理 scipy.signal
# 8.2.1 线性代数
# Random 2x2 systems. np.random.rand is called directly: `sp.rand` is a
# deprecated re-export of the same NumPy function in the SciPy root
# namespace, so the NumPy call does not depend on the SciPy version.
A = np.random.rand(2, 2)
B = np.random.rand(2, 2)
from scipy import linalg as LA
# Solve A @ X = B for X.
X = LA.solve(A, B)
print(X)
'''
[[ 0.21226312  1.92812885]
 [ 0.54343623 -0.08202333]]
'''
print(A.dot(B))
'''
[[ 0.41041687  0.6001985 ]
 [ 0.46383677  0.79950073]]
'''
# 8.2.2 特征值与特征向量
evals = LA.eigvals(A)
print(evals)
'''[-0.00542105+0.j  0.45753295+0.j]'''
evals, evect = LA.eig(A)
print((evals, evect))
'''
(array([ 1.09776801+0.j,  0.19939128+0.j]), array([[ 0.56486092, -0.35585864],
       [ 0.82518613,  0.93453979]]))
'''
print(LA.eig(A))
'''
(array([ 1.52391308+0.j,  0.29130459+0.j]), array([[ 0.62099076, -0.54159873],
       [ 0.78381789,  0.84063715]]))
'''
print(LA.inv(A))
'''[[-0.28075038  3.9631977 ]
 [ 1.58581322 -2.69374912]]
'''
# 8.2.3 Sparse matrices
from scipy import sparse as s
# A diagonal matrix: most entries are zero, so it suits sparse storage
A = np.array([[1, 0, 0], [0, 2, 0], [0, 0, 3]])
print(A)
'''
[[1 0 0]
 [0 2 0]
 [0 0 3]]
'''
# Compressed Sparse Row representation; printing lists only the stored entries
C = s.csr_matrix(A)
print(C)
'''
  (0, 0)    1
  (1, 1)    2
  (2, 2)    3
'''
# Convert back to a dense ndarray
print(C.toarray())
'''
[[1 0 0]
 [0 2 0]
 [0 0 3]]
'''
# For csr_matrix, * is the matrix product (sparse times dense)
print(C * C.todense())
'''
[[1 0 0]
 [0 4 0]
 [0 0 9]]
'''
# Sparse-sparse matrix product. Use the sparse matrix's own dot(): the old
# scipy.dot alias of numpy.dot is deprecated, and NumPy functions are not
# meant to be applied to sparse matrices directly.
print(C.dot(C).todense())
'''
[[1 0 0]
 [0 4 0]
 [0 0 9]]
'''
# 8.2.4 Optimization
# Import the submodule explicitly. A bare "import scipy" does not guarantee
# that scipy.optimize is loaded, so sp.optimize only worked when some other
# import had already pulled the submodule in.
from scipy import optimize


def f(x):
    """Objective for the demos: x**2 - 4 (minimum -4 at x = 0, roots at +/-2)."""
    return x**2 - 4


# Minimise f with BFGS starting from x0 = 0; the convergence report is printed
optimize.fmin_bfgs(f, 0)
'''
Optimization terminated successfully.
         Current function value: -4.000000
         Iterations: 0
         Function evaluations: 3
         Gradient evaluations: 1
'''
help(optimize.fmin_bfgs)
'''
Help on function fmin_bfgs in module scipy.optimize.optimize:

fmin_bfgs(f, x0, fprime=None, args=(), gtol=1e-05, norm=inf, epsilon=1.4901161193847656e-08, maxiter=None, full_output=0, disp=1, retall=0, callback=None)
    Minimize a function using the BFGS algorithm.

    Parameters
    ----------
    f : callable f(x,*args)
        Objective function to be minimized.
    x0 : ndarray
        Initial guess.
    fprime : callable f'(x,*args), optional
        Gradient of f.
    args : tuple, optional
        Extra arguments passed to f and fprime.
    gtol : float, optional
        Gradient norm must be less than gtol before successful termination.
    norm : float, optional
        Order of norm (Inf is max, -Inf is min)
    epsilon : int or ndarray, optional
        If fprime is approximated, use this value for the step size.
    callback : callable, optional
        An optional user-supplied function to call after each
        iteration.  Called as callback(xk), where xk is the
        current parameter vector.
    maxiter : int, optional
        Maximum number of iterations to perform.
    full_output : bool, optional
        If True,return fopt, func_calls, grad_calls, and warnflag
        in addition to xopt.
    disp : bool, optional
        Print convergence message if True.
    retall : bool, optional
        Return a list of results at each iteration if True.

    Returns
    -------
    xopt : ndarray
        Parameters which minimize f, i.e. f(xopt) == fopt.
    fopt : float
        Minimum value.
    gopt : ndarray
        Value of gradient at minimum, f'(xopt), which should be near 0.
    Bopt : ndarray
        Value of 1/f''(xopt), i.e. the inverse hessian matrix.
    func_calls : int
        Number of function_calls made.
    grad_calls : int
        Number of gradient calls made.
    warnflag : integer
        1 : Maximum number of iterations exceeded.
        2 : Gradient and/or function calls not changing.
    allvecs  :  list
        `OptimizeResult` at each iteration.  Only returned if retall is True.

    See also
    --------
    minimize: Interface to minimization algorithms for multivariate
        functions. See the 'BFGS' `method` in particular.

    Notes
    -----
    Optimize the function, f, whose gradient is given by fprime
    using the quasi-Newton method of Broyden, Fletcher, Goldfarb,
    and Shanno (BFGS)

    References
    ----------
    Wright, and Nocedal 'Numerical Optimization', 1999, pg. 198.
'''
# Root finding: solve f(x) = 0 starting from the initial guess 0.2.
# The submodule is imported explicitly because a bare "import scipy" does not
# guarantee that scipy.optimize has been loaded.
from scipy import optimize
print(optimize.fsolve(f, 0.2))
'''[ 2.]'''

# Deliberately bad example: the two equations x0^2 + x1^2 = -4 and
# x0^2 + x1^2 = 4 contradict each other, so no root exists and fsolve
# reports that the iteration is not making progress.
def f1(x):
    """Return the residuals of the two (inconsistent) equations at point x."""
    return [x[0] ** 2 + x[1]**2 + 4, x[0] ** 2 + x[1]**2 - 4]
print(optimize.fsolve(f1, [1, 1]))
# Raw string: the Windows path in the transcript contains backslashes that
# would otherwise be parsed as (invalid) escape sequences.
r'''
[ 0.02449328 -0.00592522]
C:\Python36\lib\site-packages\scipy\optimize\minpack.py:161: RuntimeWarning: The iteration is not making good progress, as measured by the 
  improvement from the last ten iterations.
  warnings.warn(msg, RuntimeWarning)
'''

# 8.3 pandas
import pandas as pd
# Data source: https://archive.ics.uci.edu/ml/machine-learning-databases/iris/
iris_path = "./iris/iris.data"
# With header=0 the first line of the file is consumed as column labels, so
# the first sample ends up as the header (see the output below).
data = pd.read_csv(iris_path, header=0)
print(data.head())
'''
   5.1  3.5  1.4  0.2  Iris-setosa
0  4.9  3.0  1.4  0.2  Iris-setosa
1  4.7  3.2  1.3  0.2  Iris-setosa
2  4.6  3.1  1.5  0.2  Iris-setosa
3  5.0  3.6  1.4  0.2  Iris-setosa
4  5.4  3.9  1.7  0.4  Iris-setosa
'''

# Re-read with header=None and explicit column names so every line is data
iris_columns = ["sepal length", "sepal width", "petal length", "petal width", "cat"]
data = pd.read_csv(iris_path, names=iris_columns, header=None)
print(data.head())
'''
   sepal length  sepal width  petal length  petal width          cat
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa
'''
# Frequency of each distinct sepal-length value, most common first
sepal_len_cnt = data['sepal length'].value_counts()
print(sepal_len_cnt)
'''
5.0    10
6.3     9
5.1     9
6.7     8
5.7     8
5.5     7
5.8     7
6.4     7
6.0     6
4.9     6
6.1     6
5.4     6
5.6     6
6.5     5
4.8     5
7.7     4
6.9     4
5.2     4
6.2     4
4.6     4
7.2     3
6.8     3
4.4     3
5.9     3
6.6     2
4.7     2
7.6     1
7.4     1
4.3     1
7.9     1
7.3     1
7.0     1
4.5     1
5.3     1
7.1     1
Name: sepal length, dtype: int64
'''
# Number of samples per class label
print(data['cat'].value_counts())
'''
Iris-versicolor    50
Iris-setosa        50
Iris-virginica     50
Name: cat, dtype: int64
'''

# 8.3.2 Series
# Data source: http://archive.ics.uci.edu/ml/machine-learning-databases/00312/
dow_jones_path = "./data/dow_jones_index.data"
# Load only the first 100 rows, parse the 'date' column as timestamps and use
# it as the index
stockdata = pd.read_csv(
    dow_jones_path,
    parse_dates=['date'],
    index_col=['date'],
    nrows=100,
)
print(stockdata.head())
'''
            quarter stock    open    high     low   close     volume  \
date                                                                   
2011-01-07        1    AA  $15.82  $16.72  $15.78  $16.42  239655616   
2011-01-14        1    AA  $16.71  $16.71  $15.64  $15.97  242963398   
2011-01-21        1    AA  $16.19  $16.38  $15.60  $15.79  138428495   
2011-01-28        1    AA  $15.87  $16.63  $15.82  $16.13  151379173   
2011-02-04        1    AA  $16.18  $17.39  $16.18  $17.14  154387761   

            percent_change_price  percent_change_volume_over_last_wk  \
date                                                                   
2011-01-07               3.79267                                 NaN   
2011-01-14              -4.42849                            1.380223   
2011-01-21              -2.47066                          -43.024959   
2011-01-28               1.63831                            9.355500   
2011-02-04               5.93325                            1.987452   

            previous_weeks_volume next_weeks_open next_weeks_close  \
date                                                                 
2011-01-07                    NaN          $16.71           $15.97   
2011-01-14            239655616.0          $16.19           $15.79   
2011-01-21            242963398.0          $15.87           $16.13   
2011-01-28            138428495.0          $16.18           $17.14   
2011-02-04            151379173.0          $17.33           $17.37   

            percent_change_next_weeks_price  days_to_next_dividend  \
date                                                                 
2011-01-07                        -4.428490                     26   
2011-01-14                        -2.470660                     19   
2011-01-21                         1.638310                     12   
2011-01-28                         5.933250                      5   
2011-02-04                         0.230814                     97   

            percent_return_next_dividend  
date                                      
2011-01-07                      0.182704  
2011-01-14                      0.187852  
2011-01-21                      0.189994  
2011-01-28                      0.185989  
2011-02-04                      0.175029  
'''
# Column maxima. Series.max() is vectorised and skips NaN, whereas the
# builtin max() iterates in Python and gives order-dependent results when
# NaN is present.
print(stockdata['volume'].max())
'''1453438639'''
print(stockdata['percent_change_price'].max())
'''7.62174'''
# The DatetimeIndex built from the 'date' column
print(stockdata.index)
'''
DatetimeIndex(['2011-01-07', '2011-01-14', '2011-01-21', '2011-01-28',
               '2011-02-04', '2011-02-11', '2011-02-18', '2011-02-25',
               '2011-03-04', '2011-03-11', '2011-03-18', '2011-03-25',
               '2011-01-07', '2011-01-14', '2011-01-21', '2011-01-28',
               '2011-02-04', '2011-02-11', '2011-02-18', '2011-02-25',
               '2011-03-04', '2011-03-11', '2011-03-18', '2011-03-25',
               '2011-01-07', '2011-01-14', '2011-01-21', '2011-01-28',
               '2011-02-04', '2011-02-11', '2011-02-18', '2011-02-25',
               '2011-03-04', '2011-03-11', '2011-03-18', '2011-03-25',
               '2011-01-07', '2011-01-14', '2011-01-21', '2011-01-28',
               '2011-02-04', '2011-02-11', '2011-02-18', '2011-02-25',
               '2011-03-04', '2011-03-11', '2011-03-18', '2011-03-25',
               '2011-01-07', '2011-01-14', '2011-01-21', '2011-01-28',
               '2011-02-04', '2011-02-11', '2011-02-18', '2011-02-25',
               '2011-03-04', '2011-03-11', '2011-03-18', '2011-03-25',
               '2011-01-07', '2011-01-14', '2011-01-21', '2011-01-28',
               '2011-02-04', '2011-02-11', '2011-02-18', '2011-02-25',
               '2011-03-04', '2011-03-11', '2011-03-18', '2011-03-25',
               '2011-01-07', '2011-01-14', '2011-01-21', '2011-01-28',
               '2011-02-04', '2011-02-11', '2011-02-18', '2011-02-25',
               '2011-03-04', '2011-03-11', '2011-03-18', '2011-03-25',
               '2011-01-07', '2011-01-14', '2011-01-21', '2011-01-28',
               '2011-02-04', '2011-02-11', '2011-02-18', '2011-02-25',
               '2011-03-04', '2011-03-11', '2011-03-18', '2011-03-25',
               '2011-01-07', '2011-01-14', '2011-01-21', '2011-01-28'],
              dtype='datetime64[ns]', name='date', freq=None)
'''
# Calendar components of every timestamp in the date index
date_index = stockdata.index
# Day of the month
print(date_index.day)
'''
Int64Index([ 7, 14, 21, 28,  4, 11, 18, 25,  4, 11, 18, 25,  7, 14, 21, 28,  4,
            11, 18, 25,  4, 11, 18, 25,  7, 14, 21, 28,  4, 11, 18, 25,  4, 11,
            18, 25,  7, 14, 21, 28,  4, 11, 18, 25,  4, 11, 18, 25,  7, 14, 21,
            28,  4, 11, 18, 25,  4, 11, 18, 25,  7, 14, 21, 28,  4, 11, 18, 25,
             4, 11, 18, 25,  7, 14, 21, 28,  4, 11, 18, 25,  4, 11, 18, 25,  7,
            14, 21, 28,  4, 11, 18, 25,  4, 11, 18, 25,  7, 14, 21, 28],
           dtype='int64', name='date')
'''
# Month number
print(date_index.month)
'''
Int64Index([1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3,
            3, 3, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1, 1, 2, 2, 2, 2,
            3, 3, 3, 3, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1, 1, 2, 2,
            2, 2, 3, 3, 3, 3, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1, 1,
            2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1, 1],
           dtype='int64', name='date')
'''
# Year
print(date_index.year)
'''
Int64Index([2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
            2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
            2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
            2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
            2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
            2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
            2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
            2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
            2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
            2011],
           dtype='int64', name='date')
'''
# Group the rows into calendar-month bins and sum each column within a bin
monthly_totals = stockdata.resample('M').apply(np.sum)
print(monthly_totals)
'''
            quarter      volume  percent_change_price  \
date                                                    
2011-01-31       36  6779916771             19.637287   
2011-02-28       32  5713027799             28.553732   
2011-03-31       32  5535580114             -7.317345   

            percent_change_volume_over_last_wk  previous_weeks_volume  \
date                                                                    
2011-01-31                          165.675299           5.057285e+09   
2011-02-28                          279.247846           6.077730e+09   
2011-03-31                          -23.774935           5.596445e+09   

            percent_change_next_weeks_price  days_to_next_dividend  \
date                                                                 
2011-01-31                        34.302458                   2618   
2011-02-28                        -4.583387                   1637   
2011-03-31                         3.263918                   1560   

            percent_return_next_dividend  
date                                      
2011-01-31                     18.519712  
2011-02-28                     13.819996  
2011-03-31                     13.930990  
'''
# 8.3.3 Column transformations
# Dropping a column: DataFrame.drop returns a new frame. The result is not
# bound to a name here, so stockdata itself keeps the column (it still shows
# up in the dropna() output further down).
stockdata.drop(["percent_change_volume_over_last_wk"], axis=1)
# Build a second frame restricted to the listed columns
selected_columns = ['stock', 'open', 'high', "low", "close", "volume"]
stockdata_new = pd.DataFrame(stockdata, columns=selected_columns)
print(stockdata_new.head())
'''
           stock    open    high     low   close     volume
date                                                       
2011-01-07    AA  $15.82  $16.72  $15.78  $16.42  239655616
2011-01-14    AA  $16.71  $16.71  $15.64  $15.97  242963398
2011-01-21    AA  $16.19  $16.38  $15.60  $15.79  138428495
2011-01-28    AA  $15.87  $16.63  $15.82  $16.13  151379173
2011-02-04    AA  $16.18  $17.39  $16.18  $17.14  154387761
'''
# Overwrite every value in this column with the constant 0
stockdata["previous_weeks_volume"] = 0
# 8.3.4 Noisy data
# Discard rows that contain missing values, then show the first two that remain
print(stockdata.dropna().head(2))
'''
            quarter stock    open    high     low   close     volume  \
date                                                                   
2011-01-14        1    AA  $16.71  $16.71  $15.64  $15.97  242963398   
2011-01-21        1    AA  $16.19  $16.38  $15.60  $15.79  138428495   

            percent_change_price  percent_change_volume_over_last_wk  \
date                                                                   
2011-01-14              -4.42849                            1.380223   
2011-01-21              -2.47066                          -43.024959   

            previous_weeks_volume next_weeks_open next_weeks_close  \
date                                                                 
2011-01-14                      0          $16.19           $15.79   
2011-01-21                      0          $15.87           $16.13   

            percent_change_next_weeks_price  days_to_next_dividend  \
date                                                                 
2011-01-14                         -2.47066                     19   
2011-01-21                          1.63831                     12   

            percent_return_next_dividend  
date                                      
2011-01-14                      0.187852  
2011-01-21                      0.189994  
'''
# Before cleaning, the prices are '$'-prefixed strings, so describe() reports
# object-dtype statistics (count / unique / top / freq)
print(stockdata_new['open'].describe())
'''
count        100
unique        99
top       $43.86
freq           2
Name: open, dtype: object
'''
# Remove the currency sign and convert the open and close columns to numbers
for price_column in ('open', 'close'):
    without_sign = stockdata_new[price_column].str.replace('$', '')
    stockdata_new[price_column] = pd.to_numeric(without_sign)
# After cleaning, describe() yields numeric summary statistics
print(stockdata_new['open'].describe())
'''
count    100.000000
mean      51.286800
std       32.154889
min       13.710000
25%       17.705000
50%       46.040000
75%       72.527500
max      106.900000
Name: open, dtype: float64
'''
# Derive a new column by applying a function to each opening price
stockdata_new['newopen'] = stockdata_new['open'].apply(lambda price: 0.8*price)
print(stockdata_new['newopen'].head(5))
'''
date
2011-01-07    12.656
2011-01-14    13.368
2011-01-21    12.952
2011-01-28    12.696
2011-02-04    12.944
Name: newopen, dtype: float64
'''
# Keep only the rows whose stock symbol is AA
stockAA = stockdata_new.query('stock=="AA"')
print(stockAA.head())
'''
           stock   open    high     low  close     volume  newopen
date                                                              
2011-01-07    AA  15.82  $16.72  $15.78  16.42  239655616   12.656
2011-01-14    AA  16.71  $16.71  $15.64  15.97  242963398   13.368
2011-01-21    AA  16.19  $16.38  $15.60  15.79  138428495   12.952
2011-01-28    AA  15.87  $16.63  $15.82  16.13  151379173   12.696
2011-02-04    AA  16.18  $17.39  $16.18  17.14  154387761   12.944
'''
# 8.4 matplotlib
import matplotlib.pyplot as plt
from matplotlib import figure
# Rows whose stock symbol is CSCO
stockCSCO = stockdata_new.query('stock=="CSCO"')
# Print the preview: a bare head() expression displays nothing in a script
print(stockCSCO.head())
plt.figure()
# Scatter the trading volume of every loaded row against its date
plt.scatter(stockdata_new.index.date, stockdata_new.volume)
plt.xlabel('day')
# The plotted y data is the volume column, so the axis is labelled as such
# (the label previously claimed to show the close value)
plt.ylabel('stock volume')
plt.title('title')
plt.savefig("nltkplot01.png")

# 8.4.1 Subplots
# Start a fresh figure; otherwise the 2x2 grid is drawn on the figure that is
# still current from the previous plot and nltkplot02.png mixes both.
# NOTE(review): DatetimeIndex.weekofyear is deprecated and removed in
# pandas 2.0; on newer pandas use index.isocalendar().week instead.
plt.figure()
# Top-left: AA opening price per week, red dashed line
plt.subplot(2, 2, 1)
plt.plot(stockAA.index.weekofyear, stockAA.open, 'r--')
# Top-right: CSCO opening price per week, green line with star markers
plt.subplot(2, 2, 2)
plt.plot(stockCSCO.index.weekofyear, stockCSCO.open, 'g-*')
# Bottom-left: AA again, green dashed line
plt.subplot(2, 2, 3)
plt.plot(stockAA.index.weekofyear, stockAA.open, 'g--')
# Bottom-right: CSCO again, red line with star markers
plt.subplot(2, 2, 4)
plt.plot(stockCSCO.index.weekofyear, stockCSCO.open, 'r-*')
plt.savefig('nltkplot02.png')

# "bad test": the same line drawn in two side-by-side panels
x = [1, 3, 4, 5, 8, 14]
y = [0, 2, 4, 7, 9, 19]
# One row, two columns of axes on a new figure
fig, axes = plt.subplots(nrows=1, ncols=2)
for ax in axes:
    ax.plot(x, y, 'r')
    # Set both axis labels and the title in a single call
    ax.set(xlabel='x', ylabel='y', title='title')
plt.savefig("nltkplot03.png")

# 8.4.2 Adding axes
fig = plt.figure()
# The rectangle is [left, bottom, width, height] as fractions of the figure
axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])
axes.plot(x, y, 'r')
plt.savefig("nltkplot04.png")

# Two stocks on one set of axes, told apart by the legend
fig = plt.figure()
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
ax.plot(stockAA.index.weekofyear, stockAA.open, label="AA")
# Pair the CSCO prices with CSCO's own week numbers (this line previously
# used stockAA's index for the x values)
ax.plot(stockCSCO.index.weekofyear, stockCSCO.open, label="CSCO")
ax.set_xlabel("weekofyear")
ax.set_ylabel("stock value")
ax.set_title('Weekly change in stock price')
# loc=2 places the legend in the upper-left corner
ax.legend(loc=2)
plt.savefig("nltkplot05.png")

# 8.4.3 Scatter plot
# Start a fresh figure; otherwise the points are added to the axes of the
# previous figure and nltkplot06.png also contains the earlier line plot.
plt.figure()
plt.scatter(stockAA.index.weekofyear, stockAA.open)
plt.savefig("nltkplot06.png")

# 8.4.4 Bar chart
# Start a fresh figure so the bars are not drawn over the previous plot
plt.figure()
n = 12
X = np.arange(n)
# Two independent sets of bar heights, uniform on [0.5, 1.0)
Y1 = np.random.uniform(0.5, 1.0, n)
Y2 = np.random.uniform(0.5, 1.0, n)
# Y1 is drawn upwards and Y2 mirrored downwards from the x axis
plt.bar(X, +Y1, facecolor='#9999ff', edgecolor='white')
plt.bar(X, -Y2, facecolor='#ff9999', edgecolor='white')
plt.savefig("nltkplot07.png")

# 8.4.5 3D plotting
# Importing Axes3D registers the '3d' projection on older matplotlib releases
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
# Create the 3D axes through the figure so they are attached to it; a bare
# Axes3D(fig) no longer adds itself to the figure on current matplotlib.
ax = fig.add_subplot(111, projection='3d')
# Regular grid over [-4, 4) x [-4, 4) with a step of 0.25
X = np.arange(-4, 4, 0.25)
Y = np.arange(-4, 4, 0.25)
X, Y = np.meshgrid(X, Y)
# Distance from the origin. The two squares must be added: passing Y**2 as a
# second positional argument makes it the ufunc's output buffer, which gave
# sqrt(X**2) instead of the radius.
R = np.sqrt(X**2 + Y**2)
Z = np.sin(R)
ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap='hot')
plt.savefig("nltkplot08.png")

你可能感兴趣的:(NLTK)