存储海量文件,得启用 Auto-Sharding。
> admin.runCommand({ enablesharding: "test" }) > admin.runCommand({ shardcollection: "test.fs.chunks", key:{files_id:1} })
PyMongo API 中有个 GridFS 对象,使用方法很简单。
>>> from pymongo import * >>> from pymongo.objectid import ObjectId >>> from gridfs import * >>> from pprint import pprint >>> conn = Connection() >>> db = conn.test >>> gfs = GridFS(db)
随便找个文件存到 GridFS 中,除了必须的 filename 外,还可以附加任意属性。
>>> with open("/home/yuhen/a.txt", "r") as file: ... id = gfs.put(file.read(), filename = "/xxx/xxx/a.txt", upload = "q.yuhen", abc = 123) ... print id ... 4c85e1f8499b144773000000 >>> gfs.list() [u'/xxx/xxx/a.txt']
用 mongo 看看数据库中存储的具体信息。
$ ./mongo MongoDB shell version: 1.6.2 connecting to: test > show collections fs.chunks fs.files system.indexes > db.fs.files.find() { "_id" : ObjectId("4c85e1f8499b144773000000"), "abc" : 123, "chunkSize" : 262144, "upload" : "q.yuhen", "filename" : "/xxx/xxx/a.txt", "length" : 14, "uploadDate" : "Tue Sep 07 2010 14:55:52 GMT+0800 (CST)", "md5" : "284d1d15a9d681b288cb5915ada39f53" } > db.fs.chunks.find() { "_id" : ObjectId("4c85e1f8499b144773000001"), "n" : 0, "data" : BinData(2,"DgAAAGFiY2VmZywgaGVsbG8K"), "files_id" : ObjectId("4c85e1f8499b144773000000") }
我们可以用 file_id 读取文件数据(包括文件内容和附加属性)。
>>> with open("/home/yuhen/a2.txt", "w") as file: ... out = gfs.get(ObjectId("4c85e1f8499b144773000000")) ... file.write(out.read()) ... pprint(dir(out)) ... pprint(out._file.items()) ... print out.name ... print out.upload ... print out.abc ... [ ..., '_file', '_id', 'aliases', 'chunk_size', 'content_type', 'length', 'md5', 'metadata', 'name', 'read', 'seek', 'tell', 'upload_date' ] [ (u'abc', 123), (u'chunkSize', 262144), (u'upload', u'q.yuhen'), (u'filename', u'/xxx/xxx/a.txt'), (u'length', 14), (u'uploadDate', datetime.datetime(2010, 9, 7, 6, 55, 52, 611000)), (u'_id', ObjectId('4c85e1f8499b144773000000')), (u'md5', u'284d1d15a9d681b288cb5915ada39f53') ] /xxx/xxx/a.txt q.yuhen 123
还可以用 filename 读取文件。在应用中我们通常要确保 filename 的唯一性。
>>> with open("/home/yuhen/a3.txt", "w") as file: ... out = gfs.get_last_version("/xxx/xxx/a.txt") ... file.write(out.read()) ... print out.name, out.length ... /xxx/xxx/a.txt 14
用相同的 filename 存储同一文件的不同版本是允许的,数据库中会保留所有历史数据。但用 get_last_version(filename) 只能取回最后一次更新的数据。如果需要其他版本,可以直接从 db.fs.files 中查询对应的 file_id,再按 file_id 读取该版本的内容。
>>> with open("/home/yuhen/a.txt", "r") as file: ... id = gfs.put(file.read(), filename = "/xxx/xxx/a.txt", upload = "q.yuhen", abc = 456) ... print id ... 4c85e70a499b144773000004 >>> gfs.list() [u'/xxx/xxx/a.txt'] >>> with open("/home/yuhen/a3.txt", "w") as file: ... out = gfs.get_last_version("/xxx/xxx/a.txt") ... file.write(out.read()) ... print out.name, out.length, out.abc ... /xxx/xxx/a.txt 22 456 >>> for f in db.fs.files.find({ "filename":"/xxx/xxx/a.txt" }): ... print f["_id"], f["length"], f["abc"] ... 4c85e70a499b144773000004 22 456 4c85e1f8499b144773000000 14 123
GridFS 还提供了 exists、delete 等方法。delete 按 file_id 删除,每次只删除该 file_id 对应的那一个版本,同名文件的其他版本不受影响。下面的例子每次删除的都是文件的最后一个版本。
>>> def clear(): ... for f in gfs.list(): ... id = gfs.get_last_version(f)._id ... print id ... if gfs.exists(id): gfs.delete(id) ... >>> clear() 4c85e70a499b144773000004 >>> gfs.list() [u'/xxx/xxx/a.txt'] >>> clear() 4c85e1f8499b144773000000 >>> gfs.list() []