要存储海量文件,需要启用 Auto-Sharding:先对数据库启用分片,再以 files_id 为片键对 fs.chunks 集合进行分片。
> admin.runCommand({ enablesharding: "test" })
> admin.runCommand({ shardcollection: "test.fs.chunks", key: { files_id: 1 } })
PyMongo API 中有个 GridFS 对象,使用方法很简单。
>>> from pymongo import *
>>> from pymongo.objectid import ObjectId
>>> from gridfs import *
>>> from pprint import pprint
>>> conn = Connection()
>>> db = conn.test
>>> gfs = GridFS(db)
随便找个文件存到 GridFS 中,除了必须的 filename 外,还可以附加任意属性。
>>> with open("/home/yuhen/a.txt", "r") as file:
... id = gfs.put(file.read(), filename = "/xxx/xxx/a.txt", upload = "q.yuhen", abc = 123)
... print id
...
4c85e1f8499b144773000000
>>> gfs.list()
[u'/xxx/xxx/a.txt']
用 mongo 看看数据库中存储的具体信息。
$ ./mongo
MongoDB shell version: 1.6.2
connecting to: test
> show collections
fs.chunks
fs.files
system.indexes
> db.fs.files.find()
{
"_id" : ObjectId("4c85e1f8499b144773000000"),
"abc" : 123,
"chunkSize" : 262144,
"upload" : "q.yuhen",
"filename" : "/xxx/xxx/a.txt",
"length" : 14,
"uploadDate" : "Tue Sep 07 2010 14:55:52 GMT+0800 (CST)",
"md5" : "284d1d15a9d681b288cb5915ada39f53"
}
> db.fs.chunks.find()
{
"_id" : ObjectId("4c85e1f8499b144773000001"),
"n" : 0,
"data" : BinData(2,"DgAAAGFiY2VmZywgaGVsbG8K"),
"files_id" : ObjectId("4c85e1f8499b144773000000")
}
我们可以用 file_id 读取文件数据(包括文件内容和附加属性)。
>>> with open("/home/yuhen/a2.txt", "w") as file:
... out = gfs.get(ObjectId("4c85e1f8499b144773000000"))
... file.write(out.read())
... pprint(dir(out))
... pprint(out._file.items())
... print out.name
... print out.upload
... print out.abc
...
[
...,
'_file',
'_id',
'aliases',
'chunk_size',
'content_type',
'length',
'md5',
'metadata',
'name',
'read',
'seek',
'tell',
'upload_date'
]
[
(u'abc', 123),
(u'chunkSize', 262144),
(u'upload', u'q.yuhen'),
(u'filename', u'/xxx/xxx/a.txt'),
(u'length', 14),
(u'uploadDate', datetime.datetime(2010, 9, 7, 6, 55, 52, 611000)),
(u'_id', ObjectId('4c85e1f8499b144773000000')),
(u'md5', u'284d1d15a9d681b288cb5915ada39f53')
]
/xxx/xxx/a.txt
q.yuhen
123
还可以用 filename 读取文件。在应用中我们通常要确保 filename 的唯一性。
>>> with open("/home/yuhen/a3.txt", "w") as file:
... out = gfs.get_last_version("/xxx/xxx/a.txt")
... file.write(out.read())
... print out.name, out.length
...
/xxx/xxx/a.txt 14
用相同的 filename 存储同一文件的不同版本是允许的,数据库中会保留所有历史数据。但用 get_last_version(filename) 只能取回最后一次更新的数据;如需获取其他版本,可以直接从 db.fs.files 中按 filename 查询出各版本的 file_id,再用 file_id 读取对应内容。
>>> with open("/home/yuhen/a.txt", "r") as file:
... id = gfs.put(file.read(), filename = "/xxx/xxx/a.txt", upload = "q.yuhen", abc = 456)
... print id
...
4c85e70a499b144773000004
>>> gfs.list()
[u'/xxx/xxx/a.txt']
>>> with open("/home/yuhen/a3.txt", "w") as file:
... out = gfs.get_last_version("/xxx/xxx/a.txt")
... file.write(out.read())
... print out.name, out.length, out.abc
...
/xxx/xxx/a.txt 22 456
>>> for f in db.fs.files.find({ "filename":"/xxx/xxx/a.txt" }):
... print f["_id"], f["length"], f["abc"]
...
4c85e70a499b144773000004 22 456
4c85e1f8499b144773000000 14 123
GridFS 还提供了 exists、delete 等方法。delete 按 file_id 删除,一次只删除该 file_id 对应的那一个版本;下面的示例每次传入的是 get_last_version 返回的 _id,因此每次调用删除的是文件的最后一个版本,需要多次调用才能删除全部版本。
>>> def clear():
... for f in gfs.list():
... id = gfs.get_last_version(f)._id
... print id
... if gfs.exists(id): gfs.delete(id)
...
>>> clear()
4c85e70a499b144773000004
>>> gfs.list()
[u'/xxx/xxx/a.txt']
>>> clear()
4c85e1f8499b144773000000
>>> gfs.list()
[]