Table of Contents
- MongoDB Advanced Queries
- Data
- Aggregation Functions
- $match Filtering
- $project
- $sort, $limit, $skip
- $sample
- Visualization Tools
- Scraping JD Products with Selenium + MongoDB
MongoDB Advanced Queries
Data
from pymongo import MongoClient
import datetime
client=MongoClient('mongodb://root:[email protected]:27017')
table=client['db1']['emp']
l=[
('张飞','male',18,'20170301','teacher',7300.33,401,1),
('张云','male',78,'20150302','teacher',1000000.31,401,1),
('刘备','male',81,'20130305','teacher',8300,401,1),
('关羽','male',73,'20140701','teacher',3500,401,1),
('曹操','male',28,'20121101','teacher',2100,401,1),
('诸葛亮','female',18,'20110211','teacher',9000,401,1),
('周瑜','male',18,'19000301','teacher',30000,401,1),
('司马懿','male',48,'20101111','teacher',10000,401,1),
('袁绍','female',48,'20150311','sale',3000.13,402,2),
('张全蛋','female',38,'20101101','sale',2000.35,402,2),
('鹌鹑蛋','female',18,'20110312','sale',1000.37,402,2),
('王尼玛','female',18,'20160513','sale',3000.29,402,2),
('我尼玛','female',28,'20170127','sale',4000.33,402,2),
('杨过','male',28,'20160311','operation',10000.13,403,3),
('小龙女','male',18,'19970312','operation',20000,403,3),
('郭靖','female',18,'20130311','operation',19000,403,3),
('黄蓉','male',18,'20150411','operation',18000,403,3),
('梅超风','female',18,'20140512','operation',17000,403,3)
]
for n, item in enumerate(l):
    d = {
        "_id": n,
        'name': item[0],
        'sex': item[1],
        'age': item[2],
        'hire_date': datetime.datetime.strptime(item[3], '%Y%m%d'),
        'post': item[4],
        'salary': item[5]
    }
    table.insert_one(d)  # write one employee document per tuple
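A quick sanity check on the inserted documents (a minimal sketch reusing the `table` handle created above; count_documents needs pymongo 3.7+):
# Verify the 18 employee documents were written.
print(table.count_documents({}))          # expect 18
print(table.find_one({"name": "张飞"}))    # inspect one document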
Aggregation Functions
db.emp.aggregate([
    {"$group":{
        "_id":"$post",
        "最高工资":{"$max":"$salary"},
        "最低工资":{"$min":"$salary"},
        "平均工资":{"$avg":"$salary"},
        "总工资":{"$sum":"$salary"},
        "人数":{"$sum":1}
    }}
])
db.emp.aggregate([
    {"$group":{
        "_id":"$post",
        "第一个":{"$first":"$name"},
        "最后一个":{"$last":"$name"}
    }}
])
db.emp.aggregate([
    {"$group":{
        "_id":"$post",
        "人员名单":{"$push":"$name"}
    }}
])
db.emp.aggregate([
    {"$group":{
        "_id":"$post",
        "人员名单":{"$addToSet":"$name"}
    }}
])
$match Filtering
db.emp.aggregate([
    {"$match":{"name":"鹌鹑蛋"}}
])
db.emp.aggregate([
    {"$match":{"_id":{"$gt":3}}}
])
db.emp.aggregate([
    {"$group":{
        "_id":"$post",
        "max_salary":{"$max":"$salary"}
    }},
    {"$match":{"max_salary":{"$gt":10000}}},
    {"$match":{"_id":{"$ne":"teacher"}}}
])
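These shell pipelines can equally be run from Python; pymongo's aggregate() takes the stages as a list. A minimal sketch, assuming the `table` handle from the Data section above is still connected:
# Same group + match pipeline as above, run through pymongo.
pipeline = [
    {"$group": {"_id": "$post", "max_salary": {"$max": "$salary"}}},
    {"$match": {"max_salary": {"$gt": 10000}}},
    {"$match": {"_id": {"$ne": "teacher"}}},
]
for doc in table.aggregate(pipeline):
    print(doc)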
$project
db.emp.aggregate([
    {"$project":{
        "name":1,
        "_id":0,
        "year_salary":{"$multiply":[12,"$salary"]}
    }}
])
db.emp.aggregate([
    {"$project":{
        "name":1,
        "_id":0,
        "after_10_year":{"$add":[10,"$age"]}
    }}
])
db.emp.aggregate([
    {"$project":{
        "name":1,
        "_id":0,
        "before_10_year":{"$subtract":["$age",10]}
    }}
])
db.emp.aggregate([
    {"$project":{
        "name":1,
        "_id":0,
        "year":{"$year":"$hire_date"}
    }}
])
db.emp.aggregate([
    {"$project":{
        "name":1,
        "_id":0,
        "job_year":{"$subtract":[{"$year":new Date()},{"$year":"$hire_date"}]}
    }}
])
db.emp.aggregate([
    {"$project":{
        "first_name":{"$substr":["$name",0,3]},
        "_id":0
    }}
])
db.emp.aggregate([
    {"$project":{
        "full_info":{"$concat":["$name","$post"]},
        "_id":0
    }}
])
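The `new Date()` call in the job_year pipeline only exists in the mongo shell; from pymongo one simple option (a sketch, reusing the `table` handle from the Data section) is to compute the current year in Python and pass it in as a constant:
import datetime

# Working years = current year minus the $year of hire_date,
# with the current year supplied from Python instead of the shell's new Date().
current_year = datetime.datetime.now().year
cursor = table.aggregate([
    {"$project": {
        "name": 1,
        "_id": 0,
        "job_year": {"$subtract": [current_year, {"$year": "$hire_date"}]},
    }}
])
for doc in cursor:
    print(doc)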
$sort, $limit, $skip
db.emp.aggregate([
    {"$sort":{"_id":1}},
    {"$limit":10},
    {"$skip":1}
])
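Pipeline stages run strictly in the order they are listed, so $limit before $skip (as above) is not the same as $skip before $limit. A pymongo pagination sketch, assuming the same `table` handle, that sorts, skips a page, then limits:
# Page 2 of the employee list, 5 documents per page.
page, page_size = 2, 5
cursor = table.aggregate([
    {"$sort": {"_id": 1}},
    {"$skip": (page - 1) * page_size},
    {"$limit": page_size},
])
for doc in cursor:
    print(doc)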
$sample
db.emp.aggregate({"$sample":{"size":3}})
Visualization Tools
https://robomongo.org
Scraping JD Products with Selenium + MongoDB
==============================spider.py===========================
import time
from urllib.parse import urlencode
from selenium.webdriver import Chrome
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import DBTool  # helper module shown below (DBTool.py)
driver = Chrome()
kw = "黄金"
par = {"enc":"utf-8","keyword":kw,"wq":kw}
kw = urlencode(par)
url = "https://search.jd.com/Search?" + kw
driver.get(url)
driver.implicitly_wait(10)
height = driver.execute_script("return document.body.clientHeight")
driver.implicitly_wait(5)
def get_datas():
    # Keep re-reading the list until all 60 items on the page have lazy-loaded.
    ul = driver.find_element_by_class_name("gl-warp")
    items = ul.find_elements_by_class_name("gl-item")
    if len(items) == 60:
        return items
    return get_datas()
def parse_data(items):
    print(len(items))
    for i in items:
        link = i.find_element_by_css_selector(".p-img a").get_attribute("href")
        img = i.find_element_by_css_selector(".p-img a img").get_attribute("src")
        if not img:
            # Lazy-loaded images keep their URL in data-lazy-img until scrolled into view.
            img = i.find_element_by_css_selector(".p-img a img").get_attribute("data-lazy-img")
            img = "https:" + img
        price = i.find_element_by_css_selector(".p-price i").text
        title = i.find_element_by_css_selector(".p-name a em").text
        shop_name = i.find_element_by_css_selector(".p-shop a").text
        commit = i.find_element_by_css_selector(".p-commit").text
        print("=========================================")
        dic = {"link": link, "img": img, "title": title, "shop_name": shop_name, "commit": commit, "price": price}
        DBTool.insert_data(dic)
def get_next():
    # Click "下一页", scroll to the bottom so the rest of the page lazy-loads,
    # then collect and store the new page of items.
    next_btn = driver.find_element_by_partial_link_text("下一页")
    next_btn.click()
    driver.execute_script("""
    window.scrollTo({
        top: %s,
        behavior: "smooth"
    });""" % height)
    print("11111")
    time.sleep(1)
    print("continue....")
    items = get_datas()
    parse_data(items)
driver.execute_script("""
window.scrollTo({
top: %s,
behavior: "smooth"
});""" % height)
items = get_datas()
parse_data(items)
for i in range(5):
    get_next()
time.sleep(30)
driver.close()
==============================DBTool.py============================
"""
连接数据库 保存数据
"""
from pymongo import MongoClient
table = None
c = None
def connect_server():
    global table, c
    c = MongoClient("mongodb://root:[email protected]:27017")
    table = c["jd"]["jd_data"]
    print(table)
def insert_data(data):
    # Connect lazily on first use; pymongo Collection objects must be compared with None.
    if table is None:
        connect_server()
    table.insert_one(data)
def close():
    c.close()
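A minimal usage sketch for this helper, assuming it is saved as DBTool.py next to the caller (the sample document below is made up for illustration):
import DBTool

# The first insert connects lazily; later inserts reuse the open client.
DBTool.insert_data({"title": "sample item", "price": "99.00"})
DBTool.close()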