s="We tend to imagine that lightning looks like the zigzag you find in the emoji. It’s rarely that simple"
dic={}
for word in s.split():
if word notin dic:
dic[word]=1else:
dic[word]+=1
print(dic)
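The same frequency count can be written more compactly with collections.Counter from the standard library; a minimal sketch using the string s defined above:
from collections import Counter
word_counts = Counter(s.split())  # builds the word-frequency mapping in one call
print(word_counts)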
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from collections import OrderedDict
from matplotlib.pylab import frange  # note: frange was removed in newer matplotlib releases
# 2. Load the data and handle the missing values
fill_data = lambda x: int(x.strip() or 0)
data = np.genfromtxt("president.txt", dtype=(int, int), converters={1: fill_data}, delimiter=",")
x = data[:, 0]
y = data[:, 1]
# Divide the data points into groups of five years each
x_group = OrderedDict()
group = 5
group_count = 1
keys = []
values = []
for i, xx in enumerate(x):
    # Individual data point is appended to list keys
    keys.append(xx)
    values.append(y[i])
    # If we have processed five data points (i.e. five years)
    if group_count == group:
        # Convert the list of keys to a tuple
        # and use the new tuple as the key to the x_group dictionary
        x_group[tuple(keys)] = values
        keys = []
        values = []
        group_count = 0  # reset so the next group also collects five points
    group_count += 1
# Accommodate the last batch of keys and values
if keys:
    x_group[tuple(keys)] = values
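To inspect the result, you could print each five-year group alongside the mean of its values; a small sketch, assuming the grouping code above has run:
for years, vals in x_group.items():
    # Each key is a tuple of years, each value the matching data points
    print(years, np.mean(vals))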
%matplotlib inline
import itertools
# Scatter plot every pair of feature columns, colouring points by class label
# (x, y and col_names as loaded from the Iris dataset below)
col_pairs = itertools.combinations(range(x.shape[1]), 2)
subplots = 321  # a 3x2 grid of subplots, starting at the first cell
plt.subplots_adjust(wspace=0.5, hspace=1)
for col_pair in col_pairs:
    plt.subplot(subplots)
    plt.scatter(x[:, col_pair[0]], x[:, col_pair[1]], c=y)
    plt.xlabel(col_names[col_pair[0]])
    plt.ylabel(col_names[col_pair[1]])
    subplots += 1
plt.show()
Using heat maps
A heat map is another interesting visualization technique. In a heat map, the data is summarized as a matrix, and the range of the attribute values is represented by a colour gradient.
1. Load the libraries and the data
from sklearn.datasets import load_iris
from sklearn.preprocessing import scale
import numpy as np
import matplotlib.pyplot as plt
data = load_iris()
x = data['data']
y = data['target']
col_names = data['feature_names']
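The heading above promises a heat map, so here is a minimal sketch of one, assuming matplotlib's pcolor and the scale function imported above; the colour gradient encodes the scaled attribute values:
x_scaled = scale(x)  # scale each feature so the colours are comparable across columns
plt.pcolor(x_scaled)
plt.colorbar()
plt.title("Heat map of the scaled Iris data")
plt.show()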
# Dispersion of the data: compute and display the range of each column
print("col_name,max,min,range")
for i, col_name in enumerate(col_names):
    print("%s,%0.2f,%0.2f,%0.2f" % (col_name, max(x[:, i]), min(x[:, i]), max(x[:, i]) - min(x[:, i])))
col_name,max,min,range
sepal length (cm),7.90,4.30,3.60
sepal width (cm),4.40,2.00,2.40
petal length (cm),6.90,1.00,5.90
petal width (cm),2.50,0.10,2.40
# Compute the dispersion of the data: variance and standard deviation
print("col_name,variance,std_dev")
for i, col_name in enumerate(col_names):
    print("%s,%0.2f,%0.2f" % (col_name, np.var(x[:, i]), np.std(x[:, i])))
# Compute the mean absolute deviation
def mad(x, axis=None):
    mean = np.mean(x, axis=axis)
    return np.sum(np.abs(x - mean)) / (1.0 * len(x))

print("col_name,mad")
for i, col_name in enumerate(col_names):
    print("%s,%0.2f" % (col_name, mad(x[:, i])))
# Create the data to be scaled
import numpy as np
np.random.seed(100)
x = [np.random.randint(10, 25) * 1.0 for i in range(10)]
# Define a min-max scaling function
def min_max(x):
    return [round((xx - min(x)) / (max(x) - min(x)), 2) for xx in x]

x, min_max(x)
$X = \frac{x - \mathrm{mean}(x)}{\mathrm{sd}(x)}$
# Load the packages
import numpy as np
from sklearn.preprocessing import scale
## Create the data
np.random.seed(100)
x = [np.random.randint(10, 25) * 1.0 for i in range(10)]
## Centering
x_centered = scale(x, with_mean=True, with_std=False)
## Standardization
x_standardized = scale(x, with_mean=True, with_std=True)
x_centered, x_standardized
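As a quick sanity check (a sketch, assuming the variables above): the centered data should have mean 0, and the standardized data should additionally have standard deviation 1.
print(np.mean(x_centered))  # ~0.0 after centering
print(np.mean(x_standardized), np.std(x_standardized))  # ~0.0 and 1.0 after standardization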
# Load the required libraries
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from collections import defaultdict
## Create a text
sentence = "Peter Piper picked a peck of pickled peppers. A peck of pickled \
peppers, Peter Piper picked !!! If Peter Piper picked a peck of pickled \
peppers, Wheres the peck of pickled peppers Peter Piper picked ?"
## Use the nltk tokenizer to split the given text into sentences
sent_list = sent_tokenize(sentence)
## Extract the words from each sentence
word_dict = defaultdict(list)
for i, sent in enumerate(sent_list):
    word_dict[i].extend(word_tokenize(sent))
print(word_dict)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
text = "Text mining, also referred to as text data mining, roughly equivalent to text analytics,\
refers to the process of deriving high-quality information from text. High-quality information is \
typically derived through the devising of patterns and trends through means such as statistical \
pattern learning. Text mining usually involves the process of structuring the input text \
(usually parsing, along with the addition of some derived linguistic features and the removal \
of others, and subsequent insertion into a database), deriving patterns within the structured data, \
and finally evaluation and interpretation of the output. 'High quality' in text mining usually \
refers to some combination of relevance, novelty, and interestingness. Typical text mining tasks \
include text categorization, text clustering, concept/entity extraction, production of granular \
taxonomies, sentiment analysis, document summarization, and entity relation modeling \
(i.e., learning relations between named entities).Text analysis involves information retrieval, \
lexical analysis to study word frequency distributions, pattern recognition, tagging/annotation, \
information extraction, data mining techniques including link and association analysis, \
visualization, and predictive analytics. The overarching goal is, essentially, to turn text \
into data for analysis, via application of natural language processing (NLP) and analytical \
methods.A typical application is to scan a set of documents written in a natural language and \
either model the document set for predictive classification purposes or populate a database \
or search index with the information extracted."
## Remove the stop words
words=word_tokenize(text)
stopword=stopwords.words("english")
print(len(words))
new_words = [word for word in words if word not in stopword]
print(len(new_words))
## Remove the punctuation
new_words_2 = [word for word in new_words if word not in string.punctuation]
print(len(new_words_2))
from nltk import stem
input_words = ['movies','dogs','planes','flowers','flies','fries','fry','weeks',
               'planted','running','throttle']
# Porter stemming
porter = stem.PorterStemmer()
[porter.stem(word) for word in input_words]
from nltk import stem
input_words = ['movies','dogs','planes','flowers','flies','fries','fry','weeks',
               'planted','running','throttle']
# WordNet lemmatization
wordnet_lemm = stem.WordNetLemmatizer()
[wordnet_lemm.lemmatize(word) for word in input_words]
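Note that lemmatize treats every word as a noun by default, so a form like 'running' comes back unchanged; passing a part-of-speech tag changes the result. A small sketch:
wordnet_lemm.lemmatize('running')           # 'running' when treated as a noun
wordnet_lemm.lemmatize('running', pos='v')  # 'run' when treated as a verb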
## Load the libraries
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
# Load the text
text = "Text mining, also referred to as text data mining, roughly equivalent to text analytics,\
refers to the process of deriving high-quality information from text. High-quality information is \
typically derived through the devising of patterns and trends through means such as statistical \
pattern learning. Text mining usually involves the process of structuring the input text \
(usually parsing, along with the addition of some derived linguistic features and the removal \
of others, and subsequent insertion into a database), deriving patterns within the structured data, \
and finally evaluation and interpretation of the output. 'High quality' in text mining usually \
refers to some combination of relevance, novelty, and interestingness. Typical text mining tasks \
include text categorization, text clustering, concept/entity extraction, production of granular \
taxonomies, sentiment analysis, document summarization, and entity relation modeling \
(i.e., learning relations between named entities).Text analysis involves information retrieval, \
lexical analysis to study word frequency distributions, pattern recognition, tagging/annotation, \
information extraction, data mining techniques including link and association analysis, \
visualization, and predictive analytics. The overarching goal is, essentially, to turn text \
into data for analysis, via application of natural language processing (NLP) and analytical \
methods.A typical application is to scan a set of documents written in a natural language and \
either model the document set for predictive classification purposes or populate a database \
or search index with the information extracted."
# Split the given text into sentences
sentence=sent_tokenize(text)
# Generate the feature vectors (term-document matrix)
count_v=CountVectorizer()
tdm=count_v.fit_transform(sentence)
# Remove the stop words
stopwords = stopwords.words("english")
count_v_sw = CountVectorizer(stop_words=stopwords)
sw_tdm=count_v_sw.fit_transform(sentence)
## Use the ngram_range option to also extract bigrams
count_v_ngram=CountVectorizer(stop_words=stopwords,ngram_range=(1,2))
ngram_tdm=count_v_ngram.fit_transform(sentence)
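To see which features were extracted, you can inspect the fitted vectorizer's vocabulary_ attribute (a sketch; the exact feature set depends on the stop-word list used):
# vocabulary_ maps each term (including bigrams here) to its column index
print(len(count_v_ngram.vocabulary_))
print(list(count_v_ngram.vocabulary_.items())[:5])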
# Load the libraries
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
# Load the data
text = "Text mining, also referred to as text data mining, roughly equivalent to text analytics,\
refers to the process of deriving high-quality information from text. High-quality information is \
typically derived through the devising of patterns and trends through means such as statistical \
pattern learning. Text mining usually involves the process of structuring the input text \
(usually parsing, along with the addition of some derived linguistic features and the removal \
of others, and subsequent insertion into a database), deriving patterns within the structured data, \
and finally evaluation and interpretation of the output. 'High quality' in text mining usually \
refers to some combination of relevance, novelty, and interestingness. Typical text mining tasks \
include text categorization, text clustering, concept/entity extraction, production of granular \
taxonomies, sentiment analysis, document summarization, and entity relation modeling \
(i.e., learning relations between named entities).Text analysis involves information retrieval, \
lexical analysis to study word frequency distributions, pattern recognition, tagging/annotation, \
information extraction, data mining techniques including link and association analysis, \
visualization, and predictive analytics. The overarching goal is, essentially, to turn text \
into data for analysis, via application of natural language processing (NLP) and analytical \
methods.A typical application is to scan a set of documents written in a natural language and \
either model the document set for predictive classification purposes or populate a database \
or search index with the information extracted."
# Extract the sentences
sentences=sent_tokenize(text)
# Build a matrix holding the term frequencies
stopwords = stopwords.words("english")
count_v = CountVectorizer(stop_words=stopwords)
tdm = count_v.fit_transform(sentences)
# Compute the TF-IDF values
tfidf = TfidfTransformer()
tdm_tfidf = tfidf.fit_transform(tdm)
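To get a feel for the weighting, you could look at the non-zero TF-IDF scores of the first sentence; a sketch, assuming the variables above:
first_row = tdm_tfidf.toarray()[0]
terms = count_v.get_feature_names_out()  # use get_feature_names() on older scikit-learn
for idx in first_row.nonzero()[0]:
    print(terms[idx], round(first_row[idx], 3))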
$\mathrm{Jaccard\_distance} = 1 - \frac{\mathrm{len}(\mathrm{intersection}(x, y))}{\mathrm{len}(\mathrm{union}(x, y))}$
$S_i = \frac{y_i - x_i}{\max(x_i, y_i)}$, where $x_i$ is the mean distance from point $i$ to the other points in its own cluster and $y_i$ is the mean distance from point $i$ to the points of the nearest neighbouring cluster.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
import matplotlib.pyplot as plt
# Generate random data
def get_random_data():
    x_1 = np.random.normal(loc=0.2, scale=0.2, size=(100, 100))
    x_2 = np.random.normal(loc=0.9, scale=0.1, size=(100, 100))
    x = np.r_[x_1, x_2]
    return x

# Define a function that forms the clusters
def form_clusters(x, k):
    model = KMeans(n_clusters=k, init="random")
    model.fit(x)
    labels = model.labels_
    print(labels)
    sh_scores = silhouette_score(x, labels)
    return sh_scores
# Call the function above for different values of k
x = get_random_data()
sh_scores = []
for i in range(1, 5):
    sh_scores.append(form_clusters(x, i + 1))
# Plot the silhouette score for each number of clusters
no_clusters = [i + 1 for i in range(1, 5)]
plt.plot(no_clusters, sh_scores)
plt.title("cluster quality")
plt.show()
# Generate the prediction report (find_class_id and p_vectors are assumed to be defined earlier in this recipe)
predicted_y = [find_class_id(instance, p_vectors) for instance in x]
from sklearn.metrics import classification_report
print(classification_report(y, predicted_y, target_names=['Iris-Setosa', 'Iris-Versicolour', 'Iris-Virginica']))
import numpy as np
import matplotlib.pyplot as plt
# Generate normal data
normal_data = np.random.randn(90, 1)
# Generate the outlier points
outlier_data = np.random.uniform(low=-9, high=9, size=(10, 1))
# Combine the data
total_data = np.r_[normal_data, outlier_data]
plt.plot(range(len(total_data)), total_data, 'b')
What is the median absolute deviation?
The median absolute deviation (MAD) is the median of the absolute deviations of the data from the data's median:
$MAD = \mathrm{median}_i\left(|X_i - \mathrm{median}(X)|\right)$
The MAD is commonly used to estimate the standard deviation: estimated standard deviation = 1.4826 * MAD. This estimated standard deviation is what R's mad function returns.
# Flag the outliers using the median absolute deviation
median = np.median(total_data)
mad = 1.4826 * np.median(np.abs(total_data - median))
## Determine the lower and upper limits
low_limit = median - 3 * mad
high_limit = median + 3 * mad
outlier = []
outlier_index = []
for i in range(len(total_data)):
    if total_data[i] < low_limit or total_data[i] > high_limit:
        outlier.append(total_data[i])
        outlier_index.append(i)
plt.scatter(range(len(total_data)),total_data,c='b')
plt.scatter(outlier_index,outlier,c='r')
plt.axhline(low_limit,ls='--')
plt.axhline(high_limit,ls='--')
# Three standard deviations from the mean
mean = np.mean(total_data)
std = np.std(total_data)
low_limit = mean - 3 * std
high_limit = mean + 3 * std
outliers = []
outlier_index = []
for i in range(len(total_data)):
    if total_data[i] > high_limit or total_data[i] < low_limit:
        outliers.append(total_data[i])
        outlier_index.append(i)
plt.scatter(range(len(total_data)), total_data, c='b')
plt.scatter(outlier_index, outliers, c='r')
plt.axhline(low_limit,ls='--')
plt.axhline(high_limit,ls='--')
# Create data with an outlier point
from collections import defaultdict
import numpy as np
instances = np.matrix([[0,0],[0,1],[1,1],[1,0],[5,0]])
import numpy as np
import matplotlib.pyplot as plt
x = np.squeeze(np.asarray(instances[:,0]))
y = np.squeeze(np.asarray(instances[:,1]))
plt.scatter(x,y)
plt.show()
## Compute the pairwise distance between the points
k = 2
from sklearn.metrics import pairwise_distances
dist = pairwise_distances(instances, metric='manhattan')
# Calculate the K-distance, using heapq to get the k nearest neighbours
import heapq
k_distance = defaultdict(tuple)
# For each data point
for i in range(instances.shape[0]):
    # Get its distance to all the other points.
    # Convert array into list for convenience
    distances = dist[i].tolist()
    # Get the Kth nearest neighbour's distance
    ksmallest = heapq.nsmallest(k + 1, distances)[1:][k - 1]
    # Get its index
    ksmallest_idx = distances.index(ksmallest)
    # For each data point store the Kth nearest neighbour distance and its index
    k_distance[i] = (ksmallest, ksmallest_idx)
def all_indices(value, inlist):
    # Return every index at which value occurs in inlist
    out_indices = []
    idx = -1
    while True:
        try:
            # Searching from idx+1 avoids returning the same first match every time
            idx = inlist.index(value, idx + 1)
            out_indices.append(idx)
        except ValueError:
            break
    return out_indices
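A quick usage example of the helper (a sketch): duplicate distances occur when two neighbours are equally far away, and all_indices recovers every matching index:
print(all_indices(1.0, [1.0, 2.0, 1.0, 3.0]))  # -> [0, 2]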
# Calculate the K-distance neighbourhood of each point
k_distance_neig = defaultdict(list)
# For each data point
for i in range(instances.shape[0]):
    # Get the point's distances to its neighbours
    distances = dist[i].tolist()
    print("k distance neighbourhood", i)
    print(distances)
    # Get the 1 to K nearest neighbours
    ksmallest = heapq.nsmallest(k + 1, distances)[1:]
    print(ksmallest)
    ksmallest_set = set(ksmallest)
    print(ksmallest_set)
    ksmallest_idx = []
    # Get the indices of the K smallest elements
    for x in ksmallest_set:
        ksmallest_idx.append(all_indices(x, distances))
    # Change a list of lists into a flat list
    ksmallest_idx = [item for sublist in ksmallest_idx for item in sublist]
    # For each data point store the K-distance neighbourhood
    k_distance_neig[i].extend(zip(ksmallest, ksmallest_idx))
# Compute the reachability distance and the local reachability density (LRD)
local_reach_density = defaultdict(float)
for i in range(instances.shape[0]):
    # LRD's numerator: the size of the K-distance neighbourhood
    no_neighbours = len(k_distance_neig[i])
    denom_sum = 0
    # Sum of the reachability distances
    for neigh in k_distance_neig[i]:
        # maximum(K-Distance(Q), Distance(P,Q))
        denom_sum += max(k_distance[neigh[1]][0], neigh[0])
    local_reach_density[i] = no_neighbours / (1.0 * denom_sum)
lof_list = []
# Compute the Local Outlier Factor for each point
for i in range(instances.shape[0]):
    lrd_sum = 0
    rdist_sum = 0
    for neigh in k_distance_neig[i]:
        lrd_sum += local_reach_density[neigh[1]]
        rdist_sum += max(k_distance[neigh[1]][0], neigh[0])
    lof_list.append((i, lrd_sum * rdist_sum))
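To see which points look most anomalous under this score, you could sort lof_list by its second element; a sketch, where a higher score means a stronger outlier candidate (the point at (5,0) should rank first):
for idx, score in sorted(lof_list, key=lambda t: t[1], reverse=True):
    print(idx, round(score, 3))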