本项目纯属学习娱乐,不用于任何商业盈利。首先向地图平台道一声对不起:未经允许,擅自爬取了数据。
https://www.python.org/
https://pypi.org/search/
pip install urllib3
https://www.postgresql.org/download/
http://postgis.net/install/
(1)数据库初始化配置Database,安装包psycopg2
pip install psycopg2
https://blog.csdn.net/weixin_40547993/article/details/90756631
(2)数据库连接池包安装
pip install DBUtils
(1)数据库初始化和池化参数
# PostgreSQL data-source settings.
# 'dbDriver' holds connection keyword arguments; 'poolDB' holds
# connection-pool sizing options.
db = {
    'dbDriver': {
        'user': 'postgres',
        'password': 'postgres',
        'host': 'localhost',
        'port': 5432,
        'dbname': 'postgres',
        'application_name': 'bmapdata',
    },
    'poolDB': {
        # Maximum number of connections the pool allows;
        # 0 or None means unlimited.
        'maxconnections': 100,
        # Idle connections created when the pool is initialised;
        # 0 means none are created up front.
        'mincached': 2,
        # Maximum number of idle connections kept in the pool;
        # 0 or None means unlimited.
        'maxcached': 5,
        # Maximum number of shared connections;
        # 0 or None means all connections are shared.
        'maxshared': 20,
        # What to do when no connection is available:
        # True blocks and waits, False raises an error immediately.
        'blocking': True,
    },
}
(2)数据库连接池和初始化配置
import psycopg2
import importlib
import logger
import sys
from DBUtils.PooledDB import PooledDB
from DBUtils.PersistentDB import PersistentDB, PersistentDBError, NotSupportedError
sys.path.append(sys.path[0] + '/../')
# db class
class DataBase:
    """PostgreSQL access helper built on a DBUtils ``PooledDB`` pool.

    Each instance builds its own pool, checks out one connection and keeps
    a single cursor on it; every method below runs on that cursor.

    Write methods (``insert``, ``update``, ``delete``, ``transact``) commit
    on success. On failure they roll back, log the error and return
    normally, so callers are not interrupted by a failed statement.
    """

    # init
    def __init__(self, match):
        """Create the pool, open a connection/cursor and record the target.

        Args:
            match: Mapping that provides the ``"schema"`` and
                ``"tableName"`` keys.

        Raises:
            Exception: Any failure while loading the configuration,
                building the pool, connecting, or reading ``match`` is
                logged and then re-raised unchanged.
        """
        try:
            dbParms = importlib.import_module('config.dataSource').db
            # Pool sizing defaults (the values this class used to hard-code).
            poolArgs = {
                # Maximum number of connections the pool allows;
                # 0 or None means unlimited.
                'maxconnections': 5,
                # Idle connections created up front; 0 means none.
                'mincached': 2,
                # Maximum number of idle connections kept in the pool;
                # 0 or None means unlimited.
                'maxcached': 5,
                # Maximum number of shared connections; 0 or None means
                # all are shared. (Original author's note: this has little
                # practical effect.)
                'maxshared': 3,
                # When no connection is available: True blocks and waits,
                # False raises an error.
                'blocking': True,
            }
            # The optional 'poolDB' section of the configuration overrides
            # the defaults above; previously that section was never read.
            poolArgs.update(dbParms.get('poolDB') or {})
            # Driver connection keywords (user, password, host, ...) are
            # forwarded to the pool together with the sizing options.
            poolArgs.update(dbParms['dbDriver'])
            self.poolDB = PooledDB(
                # DB-API 2 module used to create the connections.
                creator=psycopg2,
                # Commands executed at the start of every session.
                setsession=[],
                # 0 disables the pool's connection liveness checks.
                ping=0,
                **poolArgs
            )
            self.connection = self.poolDB.connection()
            self.cursor = self.connection.cursor()
            self.schema = match["schema"]
            self.tableName = match["tableName"]
        except Exception as e:
            logger.error('数据库连接错误:%s' % e)
            raise

    # Fetch a single row.
    def query(self, sql, data=None):
        """Execute ``sql`` and return the first result row.

        Args:
            sql: Statement to execute.
            data: Optional bound parameters for ``sql``.

        Returns:
            Whatever the cursor's ``fetchone()`` returns.
        """
        cur = self.cursor
        cur.execute(sql, data)
        return cur.fetchone()

    def insert(self, sql, datas):
        """Run ``sql`` once per parameter set in ``datas`` and commit.

        Errors are rolled back and logged, not raised.
        """
        self._executeBatch(sql, datas, "数据新增成功", '数据新增失败:%s')

    def update(self, sql, datas):
        """Run ``sql`` once per parameter set in ``datas`` and commit.

        Errors are rolled back and logged, not raised.
        """
        self._executeBatch(sql, datas, "数据更新成功", '数据更新失败:%s')

    def delete(self, sql, data=None):
        """Execute a delete statement through ``transact``.

        Args:
            sql: Statement to execute.
            data: Optional bound parameters. Prefer these over building
                the statement with string formatting.
        """
        self.transact(sql, data)

    def transact(self, sql, data=None):
        """Execute one statement and commit it.

        Args:
            sql: Statement to execute.
            data: Optional bound parameters for ``sql``.

        On any error the transaction is rolled back and the error is
        logged, matching ``insert``/``update``; nothing is raised.
        """
        conn = self.connection
        try:
            cur = self.cursor
            if data is None:
                cur.execute(sql)
            else:
                cur.execute(sql, data)
            # Persist the change.
            conn.commit()
        except Exception as e:
            # Undo the partial work and record why it failed instead of
            # swallowing the error silently.
            conn.rollback()
            logger.error('数据库事务执行失败:%s' % e)

    def close(self):
        """Close the cursor and the connection held by this instance."""
        try:
            self.cursor.close()
        finally:
            self.connection.close()

    def _executeBatch(self, sql, datas, successMessage, failureFormat):
        """Shared body of ``insert``/``update``: executemany, then commit."""
        conn = self.connection
        try:
            self.cursor.executemany(sql, datas)
            # Persist the change.
            conn.commit()
            print(successMessage)
        except Exception as e:
            # Roll back on any error and log it.
            conn.rollback()
            logger.error(failureFormat % e)
import urllib3
import json
class Urllib3Request:
    """Small helper that performs GET requests and decodes JSON replies."""

    def __init__(self):
        # One urllib3 PoolManager per instance, used for every request.
        self.http = urllib3.PoolManager()

    # get
    def urllib3Get(self, url, param):
        """Send a GET request and return the decoded JSON body.

        Args:
            url: Address to request.
            param: Passed to urllib3 as the ``fields`` argument.

        Returns:
            The parsed JSON document when the response status is 200,
            otherwise an empty dict.
        """
        response = self.http.request('GET', url, fields=param)
        if response.status != 200:
            return {}
        body = response.data.decode('utf-8')
        return json.loads(body)
import threading
import time
# Module-wide lock shared by every Thread instance below; only one worker
# runs its customRun callback at a time.
threadLock = threading.Lock()


class Thread(threading.Thread):
    """Worker thread that runs a callback while holding ``threadLock``."""

    def __init__(self, threadId, name, delay, customRun):
        """Store the worker's settings.

        Args:
            threadId: Identifier kept on the instance as ``threadId``.
            name: Thread name; also passed to ``customRun``.
            delay: Delay value passed through to ``customRun``.
            customRun: Callable invoked as ``customRun(name, delay)``.
        """
        threading.Thread.__init__(self)
        # Thread id
        self.threadId = threadId
        # Thread name
        self.name = name
        # Delay
        self.delay = delay
        # Callback to execute
        self.customRun = customRun

    def run(self):
        """Run the callback under the shared lock.

        The lock is taken with a ``with`` block so it is released even when
        ``customRun`` raises; otherwise one failing worker would leave the
        lock held and block all remaining workers forever. The exception
        itself still propagates.
        """
        print("Starting " + self.name)
        # Blocks until the lock is acquired; released on exit from the block.
        with threadLock:
            self.customRun(self.name, self.delay)
        print("Exiting " + self.name)
if __name__ == "__main__":
    # One worker thread is started per POI category.
    poiTypeThreadsList = ['美食', '酒店', '购物', '生活服务',
                          '丽人', '旅游景点', '休闲娱乐', '运动健身',
                          '教育培训', '文化传媒', '医疗', '汽车服务',
                          '交通设施', '金融', '房地产', '公司企业',
                          '政府机构', '出入口', '自然地物']
    # enumerate supplies the position directly instead of searching the
    # list with .index() on every iteration.
    # NOTE(review): customRun is not defined in the visible source; it is
    # expected to be provided elsewhere - confirm.
    for index, value in enumerate(poiTypeThreadsList):
        thread = Thread(index, value, 0, customRun)
        thread.start()
    print("Exiting Main Thread")
(1)爬虫数据格式转换
(2)数据库表实体类封装
# POI entity class
class POIEntity:
    """Flat attribute view of one POI result record.

    Every attribute is read from the ``result`` mapping and is ``None``
    when the corresponding key is absent. ``tag``, ``type`` and
    ``detail_url`` come from the nested ``"detail_info"`` mapping; ``lng``
    and ``lat`` come from the nested ``"location"`` mapping.
    """

    def __init__(self, result):
        """Populate the entity from ``result``.

        Args:
            result: Dict describing one POI.
                NOTE(review): presumably one item of the map platform's
                search response - confirm against the caller.
        """
        # A missing, null or non-dict nested section is treated as empty,
        # so its fields become None instead of raising TypeError.
        detail = self._section(result, "detail_info")
        location = self._section(result, "location")

        self.uid = result.get("uid")
        self.name = result.get("name")
        self.address = result.get("address")
        self.province = result.get("province")
        self.city = result.get("city")
        self.area = result.get("area")
        self.street_id = result.get("street_id")
        self.tag = detail.get("tag")
        self.type = detail.get("type")
        self.detail_url = detail.get("detail_url")
        self.telephone = result.get("telephone")
        self.lng = location.get("lng")
        self.lat = location.get("lat")

    @staticmethod
    def _section(result, key):
        """Return ``result[key]`` when it is a dict, otherwise ``{}``."""
        value = result.get(key)
        return value if isinstance(value, dict) else {}
(3)数据入库
数据入库时,先判断记录是否已存在:已存在则执行更新操作,否则执行新增操作。
https://github.com/yangdengxian/BMapData
本博文属博主原创,如需参考或者转载,请注明缘由。创作不易,望多多支持与谅解,也希望广大技术人员相互探讨。