Installing MongoDB
MongoDB is a non-relational (NoSQL) database written in C++. It is an open-source database system built on distributed file storage. Data is stored as JSON-like documents, and a field's value can itself be another document, an array, or an array of documents, which makes the data model very flexible.
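For example, a single document can embed another document as well as an array; the field names and values below are made up purely to illustrate the structure:
example_doc = {
    "name": "RUNOOB",
    "url": "https://www.runoob.com",
    "tags": ["mongodb", "database", "NoSQL"],         # an array value
    "contact": {"city": "Beijing", "zip": "100000"}   # an embedded document
}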
MongoDB runs on many platforms, including Windows, Linux, macOS, and Solaris; installers for each can be downloaded from the official download page at https://www.mongodb.com/download-center.
- Official website: https://www.mongodb.com
- Official documentation: https://docs.mongodb.com
- GitHub: https://github.com/mongodb
- Chinese tutorial: http://www.runoob.com/mongodb/mongodb-tutorial.html
Using MongoDB
The mongo shell session below lists the existing databases, inserts a document, queries a collection, and lists the collections of the current database (newer shells prefer insertOne()/insertMany() over the legacy insert() used here):
> show dbs
admin 0.000GB
config 0.000GB
local 0.000GB
> db.runoob.insert({"name":"菜鸟教程"})
> db.site2.find()
{ "_id" : 1, "name" : "RUNOOB", "cn_name" : "菜鸟教程" }
{ "_id" : 2, "name" : "Google", "address" : "Google 搜索" }
{ "_id" : 3, "name" : "Facebook", "address" : "脸书" }
{ "_id" : 4, "name" : "Taobao", "address" : "淘宝" }
{ "_id" : 5, "name" : "Zhihu", "address" : "知乎" }
> show collections
TutorialItem
> db.TutorialItem.find()
Using pymongo
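The examples in this section assume that the pymongo driver is installed (pip install pymongo) and that a MongoDB server is listening on the default localhost:27017 used throughout this article. A quick way to confirm the connection works:
import pymongo

# Connect to the local server; server_info() raises an exception if the server
# cannot be reached, otherwise it returns details such as the server version.
client = pymongo.MongoClient("mongodb://localhost:27017/")
print(client.server_info()["version"])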
Creating a collection
#!/usr/bin/env python
# coding=utf-8
import pymongo

# Connect to the database server
myclient = pymongo.MongoClient('mongodb://localhost:27017/')

# A database is only created once it actually contains data; the optional
# checks below list the existing databases and test whether "rundb" exists:
# dblist = myclient.list_database_names()
# print(dblist)
# # dblist = myclient.database_names()  # older, deprecated spelling
# if "rundb" in dblist:
#     print("The database already exists!")

# Create (or select) the database and the collection
mydb = myclient['rundb']
mycol = mydb["sites"]
collist = mydb.list_collection_names()
print(collist)

# Insert a single document; the result object exposes the generated _id
mydict = {"name": "RUNOOB", "alexa": "10000", "url": "https://www.runoob.com"}
x = mycol.insert_one(mydict)
print(x.inserted_id)
# Insert several documents at once
mylist = [
    {"name": "Taobao", "alexa": "100", "url": "https://www.taobao.com"},
    {"name": "QQ", "alexa": "101", "url": "https://www.qq.com"},
    {"name": "Facebook", "alexa": "10", "url": "https://www.facebook.com"},
    {"name": "知乎", "alexa": "103", "url": "https://www.zhihu.com"},
    {"name": "Github", "alexa": "109", "url": "https://www.github.com"}
]
x = mycol.insert_many(mylist)
# Print the _id values of all inserted documents
print(x.inserted_ids)
mycol = mydb["site2"]
mylist = [
{"_id": 1, "name": "RUNOOB", "cn_name": "菜鸟教程"},
{"_id": 2, "name": "Google", "address": "Google 搜索"},
{"_id": 3, "name": "Facebook", "address": "脸书"},
{"_id": 4, "name": "Taobao", "address": "淘宝"},
{"_id": 5, "name": "Zhihu", "address": "知乎"}
]
x = mycol.insert_many(mylist)
# 输出插入的所有文档对应的 _id 值
print(x.inserted_ids)
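Because site2 uses explicit _id values, running the script a second time fails with a duplicate-key error. A minimal sketch of guarding against that, reusing the mycol and mylist defined above:
from pymongo import errors

try:
    x = mycol.insert_many(mylist)
    print(x.inserted_ids)
except errors.BulkWriteError as e:
    # Raised when one of the explicit _id values already exists in site2
    print("insert failed:", e.details)
Alternatively, calling mycol.drop() before the insert clears the previous run's data.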
Querying a collection
#!/usr/bin/env python
# coding=utf-8
import pymongo

myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["rundb"]
mycol = mydb["sites"]

# Return every document in the collection
for x in mycol.find():
    print(x)

# Use a projection: include only the name and alexa fields, hide _id
for x in mycol.find({}, {"_id": 0, "name": 1, "alexa": 1}):
    print(x)

# Query with a filter condition
myquery = {"name": "RUNOOB"}
mydoc = mycol.find(myquery)
for x in mydoc:
    print(x)

# Advanced query: names starting with "R", using a regular expression
myquery = {"name": {"$regex": "^R"}}
mydoc = mycol.find(myquery)
for x in mydoc:
    print(x)

# Limit the result set to the first 3 documents
myresult = mycol.find().limit(3)
for x in myresult:
    print(x)
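Note that find() returns a cursor, which can only be iterated once; when the results need to be reused or counted, a common pattern is to materialize them into a list first (a small sketch reusing the projection from above):
# Turn the cursor into a list so the results can be reused
docs = list(mycol.find({}, {"_id": 0, "name": 1, "alexa": 1}))
print(len(docs))
print(docs)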
Storing Scrapy items in MongoDB
The item pipeline below reads the connection settings from the Scrapy settings, opens a client when the spider starts, writes each item to a collection named after the item's class, and closes the client when the spider finishes.
import pymongo


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings from the project's Scrapy settings
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # Use the item class name as the collection name
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
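For this pipeline to be used, the corresponding settings must be defined in the project's settings.py; a minimal sketch, assuming the pipeline lives in a module such as myproject.pipelines (the module path and database name are illustrative placeholders):
# settings.py
MONGO_URI = 'mongodb://localhost:27017/'
MONGO_DB = 'rundb'

ITEM_PIPELINES = {
    'myproject.pipelines.MongoPipeline': 300,  # 300 is an arbitrary pipeline priority
}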