python数据分析学习笔记八

 第八章 应用数据库

1 基于sqlite3的轻量级访问

轻盈的关系型数据库

示例代码如下:


import sqlite3

# 创建数据库联接
with sqlite3.connect(":memory:") as con:
    # 取得游标
    c = con.cursor()
    # 创建数据库表
    c.execute('''
        CREATE TABLE sensors(
                            data text,
                            city text,
                            code text,
                            sensor_id real,
                            temperature real)
    ''')

# 查询数据库表
for table in c.execute("SELECT name FROM sqlite_master WHERE type='table'"):
    print('Table', table)

# 添加记录
c.execute("INSERT INTO sensors VALUES('2016-02-27','uTRECHT','Red',42,15.14)")
# 查询所有记录
c.execute("SELECT * FROM sensors")
print(c.fetchone())
# 删除表
con.execute("DROP TABLE sensors")
# 查询数据库表
print("#of table", c.execute("SELECT COUNT (*) FROM sqlite_master WHERE type='table'").fetchone()[0])
# 关闭联接
con.close()

 

运行结果如下:

Table ('sensors',)

('2016-02-27', 'uTRECHT', 'Red', 42.0,15.14)

#of table 0

2 通过pandas访问数据库

示例代码如下:


import statsmodels.api as sm
from pandas.io.sql import read_sql
import sqlite3

# 创建数据库联接
with sqlite3.connect(":memory:") as con:
    # 取得游标
    c = con.cursor()

    # 加载数据
    data_loader = sm.datasets.sunspots.load_pandas()
    df = data_loader.data
    # 创建元组列表
    rows = [tuple(x) for x in df.values]
    # 创建一个未规定数据类型的数据表
    con.execute("CREATE TABLE sunspots (year,sunactivity)")
    # 添加多条记录
    con.executemany("INSERT INTO sunspots(year,sunactivity) VALUES (?,?)", rows)
    # 查询总记录数
    c.execute("SELECT COUNT(*) FROM sunspots")
    print(c.fetchone())
    # 删除表中记录,并显示受影响行数
    print("Deleted", con.execute("DELETE FROM sunspots where sunactivity >20").rowcount, "row")

    # 使用read_sql执行查询并返回dataFrame结果
    print(read_sql("SELECT * FROM sunspots where year <1732", con))
    # 删除数据表
    con.execute("DROP TABLE sunspots")
    c.close()

 

运行结果如下:

(309,)

Deleted 217 row

     year  sunactivity

0  1700.0          5.0

1  1701.0         11.0

2  1702.0         16.0

3  1707.0         20.0

4  1708.0         10.0

5  1709.0          8.0

6  1710.0          3.0

7  1711.0          0.0

8  1712.0          0.0

9  1713.0          2.0

10 1714.0         11.0

11 1723.0         11.0

 

3 SQLAlchemy

可以把python中的类映射为数据库中的表,类以于java中的hibernate

示例代码如下:

Alchemy_entities.py


from sqlalchemy import Column, ForeignKey, Integer, Float, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
from sqlalchemy import create_engine
from sqlalchemy import UniqueConstraint

# 超类
Base = declarative_base()

# 观测站表
class Station(Base):
    __tablename__ = 'station'  # 表名
    id = Column(Integer, primary_key=True# id
    name = Column(String(14), nullable=False, unique=True# 观测站名称

    def __repr__(self):
        return "Id=%d name=%s" % (self.id, self.name)


# 传感器表
class Sensor(Base):
    __tablename__ = 'sensor'  # 表名
    id = Column(Integer, primary_key=True# id
    last = Column(Integer)
    multiplier = Column(Float)
    station_id = Column(Integer, ForeignKey('station.id'))  # 外键
    station = relationship(Station)

    def __repr__(self):
        return "Id=%d last=%d multiplier=%.1f station_id=%d" % (self.id, self.last, self.multiplier, self.station_id)


if __name__ == "__main__":
    print("This script is used by another script. Run python alchemy_query.py")

 

 

Populate_db.py


from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from .alchemy_entities import Base, Sensor, Station


def populate(engine):
    # 创建DBSession对象
    Base.metadata.bind = engine
    DBSession = sessionmaker(bind=engine)
    session = DBSession()

    # 创建两个观测站
    de_bilt = Station(name='De Bilt')
    session.add(de_bilt)

    session.add(Station(name='Utrecht'))
    session.commit()
    print('Station', de_bilt)

    # 添加传感器记录
    temp_sesor = Sensor(last=20, multiplier=.1, station=de_bilt)
    session.add(temp_sesor)
    session.commit()
    print("Sensor", temp_sesor)


if __name__ == "__main__":
    print("This script is used by another script. Run python alchemy_query.py")

 

 

Alchemy_query.py


from Eight.alchemy_entities import Base, Station, Sensor
from Eight.populate_db import populate
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import os
from pandas.io.sql import read_sql

# 创建引擎
engine = create_engine('sqlite:///demo.db')
# 删除数据表
Base.metadata.drop_all(engine)
# 创建数据表
Base.metadata.create_all(engine)
populate(engine)
Base.metadata.bind = engine
DBSession = sessionmaker()
DBSession.bind = engine
session = DBSession()

# 查询station表中的第一行记录
station = session.query(Station).first()

# 查询所有station
print('all station', session.query(Station).all())
# 查询所有sensor
print('all sensor', session.query(Sensor).all())
# 查询第一个station的第一个sensor
print('query sensor by station', session.query(Sensor).filter(Sensor.station == station).one())
# 使用pandas的read_sql查询
print('read_sql all station', read_sql("SELECT * FROM station", engine.raw_connection()))

# 删除数据库,没有执行
try:
    os.remove('demo.db')
    print('Delete demo.db')
except OSError as e:
    # [WinError 32] 另一个程序正在使用此文件,进程无法访问。: 'demo.db'
    print(e)
    pass

 

运行结果如下:

Station Id=1 name=De Bilt

Sensor Id=1 last=20 multiplier=0.1station_id=1

all station [Id=1 name=De Bilt, Id=2name=Utrecht]

all sensor [Id=1 last=20 multiplier=0.1station_id=1]

query sensor by station Id=1 last=20multiplier=0.1 station_id=1

read_sql all station    id    name

0  1  De Bilt

1  2  Utrecht

[WinError 32] 另一个程序正在使用此文件,进程无法访问。: 'demo.db'

 

4 Pony ORM

Python写的orm包


from pony.orm import Database, db_session
from pandas.io.sql import to_sql
import statsmodels.api as sm

# 创建sqlite数据库
db = Database('sqlite', ':memory:')

# 加载数据并写入数据库
with db_session:
    data_loader = sm.datasets.sunspots.load_pandas()
    df = data_loader.data
    to_sql(df, "sunspots", db.get_connection())
    print(db.select("count(*) FROM sunspots"))

 

运行结果如下:

[309]

 

5 Dataset 懒人数据库

是sqlalchemy的一个包装器


import dataset
from pandas.io.sql import read_sql
from pandas.io.sql import to_sql
import statsmodels.api as sm

# 创建数据库连接
db = dataset.connect('sqlite:///:memory:')
# 创建bookstable = db["books"]
# 添加数据,在调用insert时会自动添加表模式
table.insert(dict(title="Numpy Beginner's guide", author='Ivan Idris'))
table.insert(dict(title="Numpy Cookbook", author='Ivan Idris'))
table.insert(dict(title="Learning Numpy", author='Ivan Idris'))
# 使用pandas的read_sql查询数据
print(read_sql('SELECT * FROM books', db.executable.raw_connection()))

# 加载数据
data_loader = sm.datasets.sunspots.load_pandas()
df = data_loader.data
to_sql(df, "sunspots", db.executable.raw_connection())
table = db['sunspots']

# 查询前5条数据
for row in table.find(_limit=5):
    print(row)
print("Table", db.tables)

运行结果如下:

id      author                   title

0  1  Ivan Idris  Numpy Beginner's guide

1  2  Ivan Idris          Numpy Cookbook

2  3  Ivan Idris          Learning Numpy

OrderedDict([('index', 0), ('YEAR',1700.0), ('SUNACTIVITY', 5.0)])

OrderedDict([('index', 1), ('YEAR',1701.0), ('SUNACTIVITY', 11.0)])

OrderedDict([('index', 2), ('YEAR',1702.0), ('SUNACTIVITY', 16.0)])

OrderedDict([('index', 3), ('YEAR',1703.0), ('SUNACTIVITY', 23.0)])

OrderedDict([('index', 4), ('YEAR',1704.0), ('SUNACTIVITY', 36.0)])

Table ['books', 'sunspots']

 

Process finished with exit code 0

 

6 pymongo与mongodb

 

指定数据存储的目录

Mkdir h:/data/db

Mongod --dbpath h:/data/db

 

Python安装mongodb的驱动程序

Python -m pip install pymongo

 

查看当前驱动版本号

C:\Users\Administrator>python -m pipfreeze|grep pymongo

pymongo==3.3.1

 

与mongodb 的测试数据库进行联接

示例代码如下:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/1/3 11:41
# @Author  : Retacn
# @Site    : 连接mongodb测试数据库
# @File    : mongodb_conn.py
# @Software: PyCharm
__author__ = "retacn"
__copyright__ = "property of mankind."
__license__ = "CN"
__version__ = "0.0.1"
__maintainer__ = "retacn"
__email__ = "[email protected]"
__status__ = "Development"

from pymongo import MongoClient
import statsmodels.api as sm
import json
import pandas as pd

# 创建数据库连接
client = MongoClient()
db = client.test_database

# 创建json并保存到数据库中
data_loader = sm.datasets.sunspots.load_pandas()
df = data_loader.data
rows = json.loads(df.T.to_json()).values()
db.sunspots.insert(rows)

#查询所有内容
cursor = db['sunspots'].find({})
df = pd.DataFrame(list(cursor))
print(df)

db.drop_collection('sunspots')

 

运行结果如下:

    SUNACTIVITY    YEAR                       _id

0          16.6  1825.0  586b24872e29db269c8854f4

1          11.0  1745.0  586b24872e29db269c8854f5

2           4.3  1856.0  586b24872e29db269c8854f6

3          26.1  1921.0  586b24872e29db269c8854f7

4          15.5  1975.0  586b24872e29db269c8854f8

5          47.8  1831.0  586b24872e29db269c8854f9

6          38.0  1955.0  586b24872e29db269c8854fa

7          14.2  1922.0  586b24872e29db269c8854fb

8          37.6  1920.0  586b24872e29db269c8854fc

9           6.4  1797.0  586b24872e29db269c8854fd

10          4.1  1798.0  586b24872e29db269c8854fe

11         28.0  1720.0  586b24872e29db269c8854ff

12        111.2  1871.0  586b24872e29db269c885500

13         83.4  1750.0  586b24872e29db269c885501

14         27.9  1963.0  586b24872e29db269c885502

15         85.9  1761.0  586b24872e29db269c885503

16         32.3  1880.0  586b24872e29db269c885504

17         70.9  1830.0  586b24872e29db269c885505

18         17.9  1985.0  586b24872e29db269c885506

19          8.6  1996.0  586b24872e29db269c885507

20         88.8  1939.0  586b24872e29db269c885508

21         16.3  1866.0  586b24872e29db269c885509

22         22.0  1746.0  586b24872e29db269c88550a

23          1.4  1913.0  586b24872e29db269c88550b

24          5.0  1812.0  586b24872e29db269c88550c

25         14.5  1800.0  586b24872e29db269c88550d

26         84.8  1780.0  586b24872e29db269c88550e

27        138.3  1837.0  586b24872e29db269c88550f

28         54.8  1858.0  586b24872e29db269c885510

29         12.2  1813.0  586b24872e29db269c885511

..          ...     ...                       ...

279       132.0  1787.0  586b24872e29db269c88560b

280         9.6  1944.0  586b24872e29db269c88560c

281        47.8  1752.0  586b24872e29db269c88560d

282       139.0  1870.0  586b24872e29db269c88560e

283         5.7  1911.0  586b24872e29db269c88560f

284        20.6  1854.0  586b24872e29db269c885610

285       121.5  1836.0  586b24872e29db269c885611

286       159.0  1959.0  586b24872e29db269c885612

287        59.1  1862.0  586b24872e29db269c885613

288        64.6  1840.0  586b24872e29db269c885614

289        66.5  1772.0  586b24872e29db269c885615

290         5.0  1902.0  586b24872e29db269c885616

291        82.9  1786.0  586b24872e29db269c885617

292        24.2  1842.0  586b24872e29db269c885618

293         4.4  1954.0  586b24872e29db269c885619

294        95.8  1860.0  586b24872e29db269c88561a

295       109.6  1938.0  586b24872e29db269c88561b

296        21.0  1724.0  586b24872e29db269c88561c

297         5.0  1700.0  586b24872e29db269c88561d

298        63.7  1883.0  586b24872e29db269c88561e

299        25.4  1886.0  586b24872e29db269c88561f

300        12.1  1899.0  586b24872e29db269c885620

301        61.5  1846.0  586b24872e29db269c885621

302        35.6  1891.0  586b24872e29db269c885622

303        79.7  1936.0  586b24872e29db269c885623

304        77.2  1861.0  586b24872e29db269c885624

305        36.3  1826.0  586b24872e29db269c885625

306         6.0  1879.0  586b24872e29db269c885626

307        26.2  1897.0  586b24872e29db269c885627

308        41.0  1794.0  586b24872e29db269c885628

 

[309 rows x 3 columns]

 

7 利用redis存储数据

是一个in-memory型的键值数据库,是c写的

示例代码如下:


import redis
import statsmodels.api as sm
import pandas as pd

# 与redis建立连接
r = redis.StrictRedis()

# 加载数据
data_loader = sm.datasets.sunspots.load_pandas()
df = data_loader.data

# 通过json字符串创建记录
data = df.T.to_json()
r.set('sunspots', data)

# 检索记录
blob = r.get('sunspots')
print(pd.read_json(blob))

运行结果如下:

            0     1     10     100   101   102    103     104     105 \

SUNACTIVITY     5   11     3    14.5   34    45    43.1   47.5    42.2  

YEAR        1700  1701  1710 1800.0  1801  1802 1803.0  1804.0  1805.0  

 

                106   ...      90      91    92     93    94      95   96   \

SUNACTIVITY    28.1  ...      89.9    66.6   60    46.9    41   21.3    16  

YEAR        1806.0   ...    1790.0 1791.0  1792  1793.0 1794  1795.0  1796  

 

                97      98     99  

SUNACTIVITY     6.4    4.1     6.8 

YEAR        1797.0  1798.0  1799.0

 

 

8 Apache Cassandra

 是结合了键值 和传统关系型数据库特性的混合型数据库,是面向列的数据库

 

Cassandra安装

1下载apache-cassandra-3.10-bin.tar.gz

2 修改cassandra.bat,添加JAVA_HOME变量

 set JAVA_HOME=C:\Java\jdk1.8.0_71

  如果本机内存不够大,可以将虚拟机内存调小.默认为2G

 -Xms512m^

 -Xmx512m^

3 python已安装略

4 修改cqlsh.bat.,设置python路径

set path =D:\Python35

5 添加环境变量

Path=F:\apache-cassandra-3.10\bin;

6 cmd执行cassandra,启动数据库

 

学习本章时未找到支持python3.5的驱动,所以使用ptyhon2.7

示列代码如下:


from cassandra import ConsistencyLevel
from cassandra.cluster import Cluster
from cassandra.query import SimpleStatement
import statsmodels.api as sm

# 与集群建立联接,并创建一个会话
cluster = Cluster()
session = cluster.connect()

# keyspace用来存储数据表的容器.
session.execute(
    "CREATE KEYSPACE IF NOT EXISTS mykeyspace WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };")
session.set_keyspace('mykeyspace')

# 创建数据表
session.execute("CREATE TABLE IF NOT EXISTS sunspots (year decimal PRIMARY KEY, sunactivity decimal);")
# 添加数据操作
query = SimpleStatement(
    "INSERT INTO sunspots (year, sunactivity) VALUES (%s, %s)",
    consistency_level=ConsistencyLevel.QUORUM)

# 加载数据
data_loader = sm.datasets.sunspots.load_pandas()
df = data_loader.data
rows = [tuple(x) for x in df.values]

# 添加数据
for row in rows:
    session.execute(query, row)

# 输出总记录数
print(session.execute("SELECT COUNT(*) FROM sunspots")._current_rows[0])

session.execute('DROP KEYSPACE mykeyspace')
cluster.shutdown()

运行结果如下:

Row(count=309)

你可能感兴趣的:(python数据分析)