pandas是一个很不错的框架,对于格式化的数据处理极为便利。最近比较频繁地使用到SQL语句,由于开发语言使用的是Python,自然想找一下是否有这方面的库,于是乎还真的被我给找到了:pandasql就是这样的一个库。当然Python从来不缺乏第三方库,比如进行SQL处理的时候有MySQL相关的,有SQLite相关的等等,就不一一列举了。pandasql好在可以与pandas完美地结合,会减少很多麻烦。这里就简单地学习一下如何使用pandasql,欢迎交流。
这里是pandasql的一些资料:
官方博客
github地址
pandas.read_csv参数使用 使用样例 dataframe使用
pandasql使用
下面是简单的使用讲解。由于用法很简单,这里不多加解释,需要说明的内容都写在代码注释中了。
#!/usr/bin/env python
#encoding:utf-8
'''
__Author__:沂水寒城
功能:pandasql库使用样例
'''
import re
import csv
import pandas as pd
from sklearn.datasets import load_iris
from pandasql import sqldf
def pysqldf(query):
    """Run a SQL ``query`` with pandasql against this module's globals.

    Convenience wrapper so callers do not have to pass a namespace
    (e.g. ``locals()``) to ``sqldf`` on every call. Because the namespace
    handed to ``sqldf`` is ``globals()``, any DataFrame named in the query
    has to be bound at module level.

    Returns whatever ``sqldf`` returns for the query.
    """
    namespace = globals()
    return sqldf(query, namespace)
def iris_data_test():
    """Demo: query the iris dataset with SQL through ``pysqldf``.

    Loads the iris data, stores it in the module-level ``iris_df`` frame,
    prints a description of the data and then the results of four queries.
    Returns nothing; all output goes to stdout.
    """
    # ``pysqldf`` passes ``globals()`` to ``sqldf``, so the frame has to be
    # a module-level name. The ``global`` declaration must come before the
    # first assignment: declaring it afterwards is a SyntaxWarning on
    # Python 2 and a SyntaxError on Python 3.
    global iris_df

    iris = load_iris()  # load the dataset
    iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
    # Strip parentheses and spaces from the column names so they can be
    # used as plain identifiers in the SQL text below.
    iris_df.columns = [re.sub("[() ]", "", col) for col in iris_df.columns]

    # Single-argument ``print(...)`` prints the same on Python 2 as the
    # statement form and is also valid on Python 3.
    print('---------------------------------------查看数据描述--------------------------------------')
    print('iris_df')
    print(iris_df)
    print('iris.feature_names')
    print(iris.feature_names)
    print('iris_df.columns')
    print(iris_df.columns)
    print('---------------------------------------------------------------------------------------')

    query1 = "select * from iris_df;"
    query2 = "select sepalwidthcm from iris_df limit 60;"
    print('-------------------------------------------全局查询语句---------------------------------------------')
    print('query1查询结果:')
    print(pysqldf(query1))
    print('query2查询结果:')
    print(pysqldf(query2))

    query3 = "select * from iris_df where sepalwidthcm>4.2;"
    print('----------------------------------------query3------------------------------------------------')
    print(query3)
    print('query3查询结果:')
    print(pysqldf(query3))

    query4 = "select * from iris_df where petallengthcm*petalwidthcm>0.2 and sepallengthcm*sepalwidthcm>20;"
    print('----------------------------------------query4------------------------------------------------')
    print(query4)
    # NOTE(review): this label says "query3" although it precedes the
    # query4 result; left unchanged so the emitted text stays as before.
    print('query3查询结果:')
    print(pysqldf(query4))
def random_data_test():
    """Demo: query a small hand-written table with SQL through ``pysqldf``.

    Builds the module-level ``random_df`` frame from literal rows, prints
    it, then prints the results of four queries (filters and self-joins).
    Returns nothing; all output goes to stdout.
    """
    # ``pysqldf`` passes ``globals()`` to ``sqldf``, so the frame has to be
    # a module-level name. The ``global`` declaration must come before the
    # name is assigned or used: declaring it afterwards is a SyntaxWarning
    # on Python 2 and a SyntaxError on Python 3.
    global random_df

    # NOTE(review): every value, including the numeric-looking ones, is a
    # string. The filters ``km>3`` and ``tz>150`` below therefore appear to
    # be evaluated as text comparisons (the recorded run returns the '3.0'
    # row for ``km>3``). Convert these columns to numeric dtypes if numeric
    # comparison is intended -- confirm before changing, as results differ.
    data_matrix = [
        ['zhaoliang', '180', '160', 'pingpang', 'banana', 'Kobe', '3.4'],
        ['wangliang', '190', '180', 'tennis', 'apple', 'James', '3.1'],
        ['liliang', '165', '150', 'football', 'strawberry', 'James', '3.3'],
        ['danliang', '175', '150', 'basketball', 'orange', 'Kobe', '3.5'],
        ['chengliang', '186', '145', 'swim', 'banana', 'Beke', '2.9'],
        ['lvliang', '186', '178', 'run', 'monkey', 'Ouwen', '3.0'],
        ['xinliang', '166', '150', 'jump', 'li', 'paul', '3.1'],
    ]
    feature_names = ['xm', 'sg', 'tz', 'ah', 'acsg', 'xhmx', 'km']
    random_df = pd.DataFrame(data_matrix, columns=feature_names)

    # Single-argument ``print(...)`` prints the same on Python 2 as the
    # statement form and is also valid on Python 3.
    print('random_df')
    print(random_df)

    query1 = "select * from random_df where km>3;"
    print('----------------------------------------query1------------------------------------------------')
    print(query1)
    print('query1查询结果:')
    print(pysqldf(query1))

    # Self-join on ``sg``: rows sharing the same ``sg`` value are paired.
    query2 = "select a.xm, a.xhmx, b.xm, b.xhmx from random_df a left outer join random_df b on a.sg=b.sg;"
    print('----------------------------------------query2------------------------------------------------')
    print(query2)
    print('query2查询结果:')
    print(pysqldf(query2))

    # Self-join on ``km``: rows sharing the same ``km`` value are paired.
    query3 = "select a.xm, b.xm from random_df a left outer join random_df b on a.km=b.km;"
    print('----------------------------------------query3------------------------------------------------')
    print(query3)
    print('查询结果:')
    print(pysqldf(query3))

    query4 = "select xm, sg, ah, km from random_df where tz>150;"
    print('----------------------------------------query4------------------------------------------------')
    print(query4)
    print('query4查询结果前3条记录:')
    # Only the first three rows of the result are shown.
    print(pysqldf(query4).head(3))
# Script entry point: run the iris demo, print a separator line, then run
# the hand-written-table demo.
if __name__ == '__main__':
    iris_data_test()
    # Parenthesized single-argument print: same output on Python 2, and
    # not a SyntaxError on Python 3 like the bare print statement.
    print('*-' * 80)
    random_data_test()
---------------------------------------查看数据描述--------------------------------------
iris_df
sepallengthcm sepalwidthcm petallengthcm petalwidthcm
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
5 5.4 3.9 1.7 0.4
6 4.6 3.4 1.4 0.3
7 5.0 3.4 1.5 0.2
8 4.4 2.9 1.4 0.2
9 4.9 3.1 1.5 0.1
10 5.4 3.7 1.5 0.2
11 4.8 3.4 1.6 0.2
12 4.8 3.0 1.4 0.1
13 4.3 3.0 1.1 0.1
14 5.8 4.0 1.2 0.2
15 5.7 4.4 1.5 0.4
16 5.4 3.9 1.3 0.4
17 5.1 3.5 1.4 0.3
18 5.7 3.8 1.7 0.3
19 5.1 3.8 1.5 0.3
20 5.4 3.4 1.7 0.2
21 5.1 3.7 1.5 0.4
22 4.6 3.6 1.0 0.2
23 5.1 3.3 1.7 0.5
24 4.8 3.4 1.9 0.2
25 5.0 3.0 1.6 0.2
26 5.0 3.4 1.6 0.4
27 5.2 3.5 1.5 0.2
28 5.2 3.4 1.4 0.2
29 4.7 3.2 1.6 0.2
.. ... ... ... ...
120 6.9 3.2 5.7 2.3
121 5.6 2.8 4.9 2.0
122 7.7 2.8 6.7 2.0
123 6.3 2.7 4.9 1.8
124 6.7 3.3 5.7 2.1
125 7.2 3.2 6.0 1.8
126 6.2 2.8 4.8 1.8
127 6.1 3.0 4.9 1.8
128 6.4 2.8 5.6 2.1
129 7.2 3.0 5.8 1.6
130 7.4 2.8 6.1 1.9
131 7.9 3.8 6.4 2.0
132 6.4 2.8 5.6 2.2
133 6.3 2.8 5.1 1.5
134 6.1 2.6 5.6 1.4
135 7.7 3.0 6.1 2.3
136 6.3 3.4 5.6 2.4
137 6.4 3.1 5.5 1.8
138 6.0 3.0 4.8 1.8
139 6.9 3.1 5.4 2.1
140 6.7 3.1 5.6 2.4
141 6.9 3.1 5.1 2.3
142 5.8 2.7 5.1 1.9
143 6.8 3.2 5.9 2.3
144 6.7 3.3 5.7 2.5
145 6.7 3.0 5.2 2.3
146 6.3 2.5 5.0 1.9
147 6.5 3.0 5.2 2.0
148 6.2 3.4 5.4 2.3
149 5.9 3.0 5.1 1.8
[150 rows x 4 columns]
iris.feature_names
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
iris_df.columns
Index([u'sepallengthcm', u'sepalwidthcm', u'petallengthcm', u'petalwidthcm'], dtype='object')
---------------------------------------------------------------------------------------
-------------------------------------------全局查询语句---------------------------------------------
query1查询结果:
sepallengthcm sepalwidthcm petallengthcm petalwidthcm
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
5 5.4 3.9 1.7 0.4
6 4.6 3.4 1.4 0.3
7 5.0 3.4 1.5 0.2
8 4.4 2.9 1.4 0.2
9 4.9 3.1 1.5 0.1
10 5.4 3.7 1.5 0.2
11 4.8 3.4 1.6 0.2
12 4.8 3.0 1.4 0.1
13 4.3 3.0 1.1 0.1
14 5.8 4.0 1.2 0.2
15 5.7 4.4 1.5 0.4
16 5.4 3.9 1.3 0.4
17 5.1 3.5 1.4 0.3
18 5.7 3.8 1.7 0.3
19 5.1 3.8 1.5 0.3
20 5.4 3.4 1.7 0.2
21 5.1 3.7 1.5 0.4
22 4.6 3.6 1.0 0.2
23 5.1 3.3 1.7 0.5
24 4.8 3.4 1.9 0.2
25 5.0 3.0 1.6 0.2
26 5.0 3.4 1.6 0.4
27 5.2 3.5 1.5 0.2
28 5.2 3.4 1.4 0.2
29 4.7 3.2 1.6 0.2
.. ... ... ... ...
120 6.9 3.2 5.7 2.3
121 5.6 2.8 4.9 2.0
122 7.7 2.8 6.7 2.0
123 6.3 2.7 4.9 1.8
124 6.7 3.3 5.7 2.1
125 7.2 3.2 6.0 1.8
126 6.2 2.8 4.8 1.8
127 6.1 3.0 4.9 1.8
128 6.4 2.8 5.6 2.1
129 7.2 3.0 5.8 1.6
130 7.4 2.8 6.1 1.9
131 7.9 3.8 6.4 2.0
132 6.4 2.8 5.6 2.2
133 6.3 2.8 5.1 1.5
134 6.1 2.6 5.6 1.4
135 7.7 3.0 6.1 2.3
136 6.3 3.4 5.6 2.4
137 6.4 3.1 5.5 1.8
138 6.0 3.0 4.8 1.8
139 6.9 3.1 5.4 2.1
140 6.7 3.1 5.6 2.4
141 6.9 3.1 5.1 2.3
142 5.8 2.7 5.1 1.9
143 6.8 3.2 5.9 2.3
144 6.7 3.3 5.7 2.5
145 6.7 3.0 5.2 2.3
146 6.3 2.5 5.0 1.9
147 6.5 3.0 5.2 2.0
148 6.2 3.4 5.4 2.3
149 5.9 3.0 5.1 1.8
[150 rows x 4 columns]
query2查询结果:
sepalwidthcm
0 3.5
1 3.0
2 3.2
3 3.1
4 3.6
5 3.9
6 3.4
7 3.4
8 2.9
9 3.1
10 3.7
11 3.4
12 3.0
13 3.0
14 4.0
15 4.4
16 3.9
17 3.5
18 3.8
19 3.8
20 3.4
21 3.7
22 3.6
23 3.3
24 3.4
25 3.0
26 3.4
27 3.5
28 3.4
29 3.2
30 3.1
31 3.4
32 4.1
33 4.2
34 3.1
35 3.2
36 3.5
37 3.1
38 3.0
39 3.4
40 3.5
41 2.3
42 3.2
43 3.5
44 3.8
45 3.0
46 3.8
47 3.2
48 3.7
49 3.3
50 3.2
51 3.2
52 3.1
53 2.3
54 2.8
55 2.8
56 3.3
57 2.4
58 2.9
59 2.7
----------------------------------------query3------------------------------------------------
select * from iris_df where sepalwidthcm>4.2;
query3查询结果:
sepallengthcm sepalwidthcm petallengthcm petalwidthcm
0 5.7 4.4 1.5 0.4
----------------------------------------query4------------------------------------------------
select * from iris_df where petallengthcm*petalwidthcm>0.2 and sepallengthcm*sepalwidthcm>20;
query3查询结果:
sepallengthcm sepalwidthcm petallengthcm petalwidthcm
0 5.4 3.9 1.7 0.4
1 5.8 4.0 1.2 0.2
2 5.7 4.4 1.5 0.4
3 5.4 3.9 1.3 0.4
4 5.7 3.8 1.7 0.3
5 5.5 4.2 1.4 0.2
6 7.0 3.2 4.7 1.4
7 6.4 3.2 4.5 1.5
8 6.9 3.1 4.9 1.5
9 6.3 3.3 4.7 1.6
10 6.7 3.1 4.4 1.4
11 6.7 3.0 5.0 1.7
12 6.0 3.4 4.5 1.6
13 6.7 3.1 4.7 1.5
14 6.3 3.3 6.0 2.5
15 7.1 3.0 5.9 2.1
16 7.6 3.0 6.6 2.1
17 7.3 2.9 6.3 1.8
18 7.2 3.6 6.1 2.5
19 6.5 3.2 5.1 2.0
20 6.8 3.0 5.5 2.1
21 6.4 3.2 5.3 2.3
22 7.7 3.8 6.7 2.2
23 7.7 2.6 6.9 2.3
24 6.9 3.2 5.7 2.3
25 7.7 2.8 6.7 2.0
26 6.7 3.3 5.7 2.1
27 7.2 3.2 6.0 1.8
28 7.2 3.0 5.8 1.6
29 7.4 2.8 6.1 1.9
30 7.9 3.8 6.4 2.0
31 7.7 3.0 6.1 2.3
32 6.3 3.4 5.6 2.4
33 6.9 3.1 5.4 2.1
34 6.7 3.1 5.6 2.4
35 6.9 3.1 5.1 2.3
36 6.8 3.2 5.9 2.3
37 6.7 3.3 5.7 2.5
38 6.7 3.0 5.2 2.3
39 6.2 3.4 5.4 2.3
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
random_df
xm sg tz ah acsg xhmx km
0 zhaoliang 180 160 pingpang banana Kobe 3.4
1 wangliang 190 180 tennis apple James 3.1
2 liliang 165 150 football strawberry James 3.3
3 danliang 175 150 basketball orange Kobe 3.5
4 chengliang 186 145 swim banana Beke 2.9
5 lvliang 186 178 run monkey Ouwen 3.0
6 xinliang 166 150 jump li paul 3.1
----------------------------------------query1------------------------------------------------
select * from random_df where km>3;
query1查询结果:
xm sg tz ah acsg xhmx km
0 zhaoliang 180 160 pingpang banana Kobe 3.4
1 wangliang 190 180 tennis apple James 3.1
2 liliang 165 150 football strawberry James 3.3
3 danliang 175 150 basketball orange Kobe 3.5
4 lvliang 186 178 run monkey Ouwen 3.0
5 xinliang 166 150 jump li paul 3.1
----------------------------------------query2------------------------------------------------
select a.xm, a.xhmx, b.xm, b.xhmx from random_df a left outer join random_df b on a.sg=b.sg;
query2查询结果:
xm xhmx xm xhmx
0 zhaoliang Kobe zhaoliang Kobe
1 wangliang James wangliang James
2 liliang James liliang James
3 danliang Kobe danliang Kobe
4 chengliang Beke chengliang Beke
5 chengliang Beke lvliang Ouwen
6 lvliang Ouwen chengliang Beke
7 lvliang Ouwen lvliang Ouwen
8 xinliang paul xinliang paul
----------------------------------------query3------------------------------------------------
select a.xm, b.xm from random_df a left outer join random_df b on a.km=b.km;
查询结果:
xm xm
0 zhaoliang zhaoliang
1 wangliang wangliang
2 wangliang xinliang
3 liliang liliang
4 danliang danliang
5 chengliang chengliang
6 lvliang lvliang
7 xinliang wangliang
8 xinliang xinliang
----------------------------------------query4------------------------------------------------
select xm, sg, ah, km from random_df where tz>150;
query4查询结果前3条记录:
xm sg ah km
0 zhaoliang 180 pingpang 3.4
1 wangliang 190 tennis 3.1
2 lvliang 186 run 3.0
[Finished in 2.1s]