mycol.remove({"city_url":{"$regex":"fang"}})
myxx = {'id': items['id'], 'province': items['province'], 'city': items['city']}
mydoc = companycol.count_documents(myxx)
# Rename the updateDate field to priceMonth for every document in the province.
mycol.update_many({"province": "陕西省"}, {'$rename': {'updateDate': 'priceMonth'}})
list_ = [i["_id"] for i in mycol.find()]
len(list_)
from multiprocessing.dummy import Pool as ThreadPool
def pross(id_):
    doc = mycol.find_one({'_id': id_})
    mycol.update_one({'_id': id_}, {"$set": {'center': 处理函数(doc)}})
pool = ThreadPool(10)
pool.map(pross,list_)
pool.close()
pool.join()
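When the per-document computation only needs data already in each document, batching the writes is usually faster than issuing one update per _id from ten threads; a minimal sketch of that alternative, reusing the same 处理函数:
from pymongo import UpdateOne
# Build one UpdateOne per document and send them to the server in a single batch.
ops = [UpdateOne({'_id': doc['_id']}, {'$set': {'center': 处理函数(doc)}}) for doc in mycol.find()]
if ops:
    mycol.bulk_write(ops, ordered=False)  # ordered=False lets the server continue past individual write errors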
db.anjuke_second_hand_house_price_0104.find(
    { "referencePriceList": [], "projectName": "子云溪苑" }  // query condition
).forEach(function (elem) {
    db.anjuke_second_hand_house_price_0104.updateOne(
        { _id: elem._id },
        { $set: { referencePriceList: [{ "date": elem.priceMonth, "price": elem.referencePrice }] } }
    );
});
import time
import pandas as pd
from loguru import logger

@logger.catch
def mongoDB_duplicates_insert(mycolInsert, mycolInserted, Primary_key):
    # Return the records of mycolInsert whose Primary_key combination is not yet in mycolInserted.
    T1 = time.perf_counter()
    mycolInsert_df = pd.DataFrame(mycolInsert.find()).drop_duplicates(Primary_key)
    mycolInserted_df = pd.DataFrame(mycolInserted.find())
    # Concatenating the target collection twice guarantees that every key already stored appears
    # more than once, so drop_duplicates(keep=False) removes it and only new records remain.
    not_duplicate_to_database_dict = pd.concat(
        [mycolInsert_df, mycolInserted_df, mycolInserted_df]
    ).drop_duplicates(Primary_key, keep=False).to_dict(orient='records')
    T2 = time.perf_counter()
    print('Runtime: %s ms' % ((T2 - T1) * 1000))
    return not_duplicate_to_database_dict
from multiprocessing.dummy import Pool as ThreadPool
second_hand_house_price_total_bendi = mydb["second_hand_house_price_total"]  # collection (analogous to a table in SQL)
not_duplicate_to_database_dict1 = mongoDB_duplicates_insert(second_hand_house_price_total_bendi, second_hand_house_price_total, ["city", "projectName", "priceMonth"])
list_1 = not_duplicate_to_database_dict1
def pross(item):
    second_hand_house_price_total.insert_one(item)
pool = ThreadPool(10)
pool.map(pross,list_1)
pool.close()
pool.join()
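Since the deduplicated records are already in memory as a list of dicts, a single batched insert is a simpler alternative to the thread pool; a minimal sketch:
# One round-trip instead of one insert_one per thread-pool task.
if list_1:
    second_hand_house_price_total.insert_many(list_1, ordered=False)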
[{'certId': 'D237052106',
'pubDate': '2022-06-22',
'endDate': '2022-12-31',
'certType': '建筑业企业资质',
'pubDepartment': '山东省住房和城乡建设厅',
'certName': ['建筑装修装饰工程专业承包二级',
'消防设施工程专业承包一级',
'防水防腐保温工程专业承包二级',
'建筑机电安装工程专业承包一级',
'地基基础工程专业承包一级',
'建筑工程施工总承包一级']},
{'certId': 'D337052103',
'pubDate': '2022-06-20',
'endDate': '2022-12-31',
'certType': '建筑业企业资质',
'pubDepartment': '日照市行政审批服务局',
'certName': ['环保工程专业承包三级', '钢结构工程专业承包三级', '水利水电工程施工总承包三级', '市政公用工程施工总承包三级']},
{'certId': 'E237046852',
'pubDate': '2022-10-31',
'endDate': '2027-10-31',
'certType': '监理资质',
'pubDepartment': '山东省住房和城乡建设厅',
'certName': ['工程监理房屋建筑工程专业甲级']}]
db.表名.aggregate(
    [
        { $match: { "Housing_type": { $ne: [] } } },  // filter on the array-type field; add e.g. "projectName": "武汉雅居乐花园别墅" if needed
        { $sort: { "_id": 1 } },  // sort
        { $group: {
            _id: { province: '$province', region: '$region', city: '$city', projectName: '$projectName', dataSource: '$dataSource' },
            count: { $sum: 1 },
            dups: { $addToSet: '$_id' }
        } },
        { $match: { count: { $gt: 1 } } }
    ],
    { allowDiskUse: true }
).forEach(function (doc) {
    doc.dups.shift();  // drop the first _id from the list so one copy of each duplicate group survives the delete below
    db.表名.deleteMany({ _id: { $in: doc.dups } });
});
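The same duplicate-removal pass can also be driven from Python; a minimal pymongo sketch, assuming col is the handle for the same collection:
pipeline = [
    {'$match': {'Housing_type': {'$ne': []}}},
    {'$sort': {'_id': 1}},
    {'$group': {'_id': {'province': '$province', 'region': '$region', 'city': '$city',
                        'projectName': '$projectName', 'dataSource': '$dataSource'},
                'count': {'$sum': 1},
                'dups': {'$addToSet': '$_id'}}},
    {'$match': {'count': {'$gt': 1}}},
]
for doc in col.aggregate(pipeline, allowDiskUse=True):
    col.delete_many({'_id': {'$in': doc['dups'][1:]}})  # keep the first _id, delete the rest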
db.getCollection('approval_sell_presell').aggregate([
    //{$match:{'documentNumber':'深房许字(2021)宝安017号'}},
    {$unwind: '$buildingInformation'},
    {$group: {_id: ['$documentNumber', '$buildingInformation.buildingName'], ct: {$sum: 1}}},
    {$match: {ct: {$gt: 1}}}
    // Alternative: group on the building name alone:
    // { $group: { _id: '$buildingInformation.buildingName', count: { $sum: 1 } } },
    // { $match: { count: { $gt: 1 } } }
])
This is a MongoDB aggregation that reads documents from the "approval_sell_presell" collection, then groups, counts, and filters them. Concretely, it performs the following steps:
1. $unwind: expands the buildingInformation array so that each array element becomes its own output document.
2. $group: groups the documents by documentNumber and buildingName, and counts the documents in each group.
3. $match: keeps only the groups whose ct count is greater than 1.
The purpose of the whole pipeline is to count the pre-sale permits per building and keep only the buildings with more than one permit. Note that the {$match:{'documentNumber':'深房许字(2021)宝安017号'}} line is commented out, so it has no effect on the result.
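For reference, the same check can be issued from Python; a minimal pymongo sketch, assuming db is a pymongo Database handle:
duplicate_buildings = list(db['approval_sell_presell'].aggregate([
    {'$unwind': '$buildingInformation'},
    {'$group': {'_id': ['$documentNumber', '$buildingInformation.buildingName'],
                'ct': {'$sum': 1}}},
    {'$match': {'ct': {'$gt': 1}}},
]))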
for r in self.new_collection.find({'city': '哈尔滨市'}, no_cursor_timeout=True):
    dnr = r.get('documentNumber')
    pne = r.get('projectName')
    region = r.get('region')
    myxx = {'documentNumber': dnr, 'projectName': pne, 'city': '哈尔滨市', 'province': '黑龙江省', 'region': region}
    mydoc = self.new_collection.count_documents(myxx)
    if mydoc > 1:
        # Keep the newest record (latest pubDate, then highest _id) and delete the rest.
        new_msg = self.new_collection.find_one(myxx, sort=[('pubDate', pymongo.DESCENDING), ('_id', pymongo.DESCENDING)])
        nid = new_msg.get('_id')
        self.new_collection.delete_many({'documentNumber': dnr, 'projectName': pne, 'city': '哈尔滨市', 'province': '黑龙江省', 'region': region, '_id': {'$ne': nid}})
        print(r['documentNumber'])
db.getCollection("Enterprise_qualification").aggregate([
//{$match:{'documentNumber':'深房许字(2021)宝安017号'}},
{$unwind:'$certList'},//将文档中的certList全部展开,每个数组元素都会作为一个新的文档输出
{$group:{_id:['$corpId','$certList.certId'],//以corpId和certList.certId作为主键进行分组聚合
ct:{$sum:1}}},//找到重复的元素
{$match:{ct:{$gt:1}}}
])
// uniscId is the company's unified social credit code
db.getCollection("Enterprise_qualification").aggregate([
{$match:{'uniscId':'91110115102058481M'}},
//{$match:{
//'enterpriseName':'91110115102058481M'}},
//{$match:{'documentNumber':'深房许字(2021)宝安017号'}},
{$unwind:'$certList'},//将文档中的certList全部展开,每个数组元素都会作为一个新的文档输出
{$group:{_id:['$uniscId','$certList.certId'],//以corpId和certList.certId作为主键进行分组聚合
ct:{$sum:1}}},//找到重复的元素
{$match:{ct:{$gt:1}}}
])
{
"legalMan": "程伟志",
"corpId": "002105291258786099",
"regionFullname": "安徽省-黄山市",
"enterpriseName": "黄山德弘建设工程有限公司",
"uniscId": "91341023MA2MRW3A6E",
"districtCode": "341000",
"address": "碧阳镇渔亭路南侧山水同和小区A10-1",
"certList": [
{
"certId": "D234520668",
"pubDate": "2022-12-29",
"endDate": "2026-05-24",
"certType": "建筑业企业资质",
"pubDepartment": "安徽省住房和城乡建设厅",
"certName": [
"古建筑工程专业承包二级",
"市政公用工程施工总承包二级",
"建筑装修装饰工程专业承包二级",
"环保工程专业承包二级"
]
},
{
"certId": "D234520668",
"pubDate": "2022-09-23",
"endDate": "2026-05-24",
"certType": "建筑业企业资质",
"pubDepartment": "安徽省住房和城乡建设厅",
"certName": [
"市政公用工程施工总承包二级",
"建筑装修装饰工程专业承包二级"
]
},
{
"certId": "D234520668",
"pubDate": "2021-06-09",
"endDate": "2026-05-24",
"certType": "建筑业企业资质",
"pubDepartment": "安徽省住房和城乡建设厅",
"certName": [
"建筑装修装饰工程专业承包二级"
]
},
{
"certId": "D334101149",
"pubDate": "2020-06-11",
"endDate": "2022-12-31",
"certType": "建筑业企业资质",
"pubDepartment": "黄山市住房和城乡建设局",
"certName": [
"建筑工程施工总承包三级"
]
},
{
"certId": "D334101149",
"pubDate": "2020-06-11",
"endDate": "2022-12-31",
"certType": "建筑业企业资质",
"pubDepartment": "黄山市住房和城乡建设局",
"certName": [
"建筑工程施工总承包三级"
]
},
{
"certId": "D334101149",
"pubDate": "2020-06-11",
"endDate": "2022-12-31",
"certType": "建筑业企业资质",
"pubDepartment": "黄山市住房和城乡建设局",
"certName": [
"建筑工程施工总承包三级"
]
}
],
"pubDate": "2022-12-29"
}
db.getCollection("数据表").aggregate([
{'$match':{'buildingInformation': {'$ne': [], }}},
{$unwind:'$buildingInformation'},//将文档中的certList全部展开,每个数组元素都会作为一个新的文档输出
{$group:{_id:{'city': '$city', 'projectName': '$projectName', 'documentNumber': '$documentNumber', 'bName': '$buildingInformation.buildingName'},//以corpId和certList.certId作为主键进行分组聚合
ct:{$sum:1}}},//找到重复的元素
{$match:{ct:{$gt:1}}},
{$group: {_id: '$_id.city', city2: {'$sum': 1}}},
{$match: {city2: {$gt:1}}}
], { allowDiskUse:true})
Merge and deduplicate identical qualifications, sort them by publication date, and produce the final qualification list.
new_collection = new_db['Enterprise_qualification']
# Merge the certificate names of entries that share the same qualification certId
def list_hebing(list1):
    # Flatten a list of lists, then deduplicate and sort the result.
    list2 = []
    for k in list1:
        list2 += k
    return sorted(set(list2))
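For example, list_hebing([['a', 'b'], ['b', 'c']]) returns ['a', 'b', 'c']: the sub-lists are concatenated, deduplicated through a set, and sorted.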
import pandas as pd
# Find the companies (uniscId) whose certList contains the same certId more than once.
lilistmongodb = list(new_collection.aggregate([
    # {'$match': {'uniscId': '91110115102058481M'}},
    {'$unwind': '$certList'},
    {'$group': {'_id': ['$uniscId', '$certList.certId'], 'ct': {'$sum': 1}}},
    {'$match': {'ct': {'$gt': 1}}}
]))
lilistmongodb = list(set([i['_id'][0] for i in lilistmongodb]))  # unique uniscId values
def same_certId_def(Certificate_list):
    # Merge duplicate certificates: combine the certName entries per certId and keep the newest record.
    ls_df = pd.DataFrame(Certificate_list)
    ls_df = ls_df.sort_values(["pubDate"], ascending=False)  # newest publication first
    ls_df = ls_df.drop_duplicates(["certId", "pubDate"])
    if type(ls_df["certName"].iloc[0]) == str:
        # Raw records: certName is a single string, so collect all names per certId into a list.
        ls_group = ls_df.groupby("certId").apply(lambda x: list(x["certName"]))
        ls_df["certName"] = ls_df["certId"].map(dict(ls_group))
        ls_df = ls_df.drop_duplicates(["certId", "endDate"])
        ls_df = ls_df[['certId', 'organDate', 'endDate', 'certType', 'organName', 'certName']]
        ls_df.columns = ['certId', 'pubDate', 'endDate', 'certType', 'pubDepartment', 'certName']
    elif type(ls_df["certName"].iloc[0]) == list:
        # Already-merged records: certName is a list, so flatten and deduplicate per certId.
        ls_group = ls_df.groupby("certId").apply(lambda x: list_hebing(x["certName"]))
        ls_df["certName"] = ls_df["certId"].map(dict(ls_group))
        ls_df = ls_df.drop_duplicates(["certId", "endDate"])
    ls_df = ls_df.sort_values(["endDate"], ascending=False)  # latest endDate first
    ls_df = ls_df.drop_duplicates("certId")  # one record per certId
    ls_df = ls_df.sort_values(["endDate"])  # final output in ascending endDate order
    return ls_df.to_dict(orient='records')
print(len(lilistmongodb))
n = 0
for k in lilistmongodb:
    n = n + 1
    print(n)
    result = new_collection.find_one({"uniscId": k})
    new_collection.update_one({"_id": result["_id"]}, {"$set": {"certList": same_certId_def(result["certList"])}})