商家数据爬取

  1. 数据来源
    美团
  2. 数据表结构
    商家数据爬取_第1张图片
  3. 商户信息获取
    先分区,分页获取商家列表。
    根据商家列表,获取商家详情
"""

美团商户数据
"""
import json
import time

import pymysql
import requests
import re
from lxml import etree

# Request headers shared by every requests.get() call in this file.
# The User-Agent is a desktop Chrome string — presumably so the site serves
# the same HTML a normal browser would get (TODO confirm it is still accepted).
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
}

def get_dateil_url():
    '''
    Collect merchant (POI) ids from the Shanghai Meituan food listing pages.

    Requests listing pages 1 through 66, pulls every ``poiId`` value out of
    the page source with a regex, appends the string form of the collected
    list to ``1.txt`` and prints the list together with its length.

    A page that cannot be fetched or decoded is reported and skipped so the
    ids gathered from the other pages are not lost.

    :return: list of the extracted id strings, in page order
    '''
    base_list = []
    for i in range(1, 67):
        url = 'https://sh.meituan.com/meishi/pn' + str(i) + '/'
        try:
            # Without a timeout a single stalled connection blocks forever.
            con = requests.get(url, headers=headers, timeout=10).content.decode('utf-8')
        except (requests.RequestException, UnicodeDecodeError) as exc:
            print('failed to fetch %s: %s' % (url, exc))
            continue
        # extend() appends in place instead of re-copying the list per page.
        base_list.extend(re.findall('{"poiId":(.+?),"frontImg"', con))

    # Explicit encoding so the output does not depend on the platform default.
    with open("1.txt", 'a+', encoding='utf-8') as f:
        f.write(str(base_list))

    print(base_list, '\n', len(base_list))
    return base_list


def _split_phone(raw_phone):
    '''
    Split the raw phone field of a merchant into ``(mobile, landline)``.

    :param raw_phone: phone string taken from the page's detail JSON; may be
        empty or None
    :return: tuple ``(mobile, tel)`` where ``mobile`` is the first 11-digit
        number starting with ``1`` found in the text (or None), and ``tel`` is
        the whole raw string when it starts with ``0`` (or None)
    '''
    if not raw_phone:
        # Indexing raw_phone[0] on an empty value used to raise and drop the
        # whole merchant; treat it as "no phone numbers" instead.
        return None, None
    tel = raw_phone if raw_phone.startswith('0') else None
    match = re.search(r"1\d{10}", raw_phone)
    mobile = match.group(0) if match else None
    return mobile, tel


def detail_data():
    '''
    Fetch the detail page of every merchant id and store it in the database.

    For each id in the hard-coded list, downloads
    ``https://www.meituan.com/meishi/<id>/``, extracts the ``detailInfo`` JSON
    embedded in the page (name, address, phone) plus the category title, and
    hands the row to ``save_data``. Ids that appear more than once in the list
    are processed only once. A merchant whose page cannot be fetched or
    parsed is reported and skipped; the crawl continues with the next id.

    The database connection is always closed, even if the crawl aborts.

    :return: None
    '''
    url_list = ['170616710', '1603003', '4509749', '83706375', '42030772', '6098820', '105698280', '1602048', '5013244', '42911618', '5483992', '6075411', '175679821', '1479772', '41441794', '182588795', '6556103', '41361382', '182593096', '1500293', '6708285', '175467139', '1602434', '2509760', '183996221', '487892', '467232', '182518405', '40694653', '6822837', '180718166', '40843113', '4884322', '184810164', '4343272', '41487162', '165875434', '83815', '6822837', '40037319', '5450114', '1602422', '159466251', '42623489', '6499811', '179954672', '487892', '42395459', '179681991', '42623489', '5766785', '481781', '5286712', '41342232', '6396052', '41361689', '4137469', '184366145', '51424016', '1479772', '99465316', '41342232', '40650907', '175148330', '501818', '40125276', '179374426', '5716113', '40652', '184451372', '41995574', '1602925', '182362080', '41785473', '1601629', '182362080', '407840', '2524878', '166239979', '4595402', '6047190', '164747640', '393515', '2636225', '2869225', '42395459', '41591871', '179618511', '40282153', '42994980', '182044391', '2458124', '51424016', '182645402', '4800048', '51028032', '1589836', '1601856', '5766785', '162663118', '40736653', '52800270', '163573713', '5484518', '52936558', '179626885', '40803943', '6024247', '115184231', '6072992', '6763697', '82899342', '4137469', '502269', '183977783', '4221169', '4091974', '169149576', '41714424', '40833128', '177391385', '6771089', '2584136', '162067191', '40306698', '6974976', '160036349', '5120128', '64454', '169646766', '6413649', '4821461', '180042720', '404157', '5927636', '181413143', '5826011', '41186410', '185670979', '5317830', '66182', '185897386', '451599', '4614535', '182435910', '1597949', '41523803', '184949448', '5727017', '4174961', '180568546', '43308', '2469181', '133055957', '40481513', '4575793', '4589102', '5652631', '6763738', '41591501', '5816423', '6238188', '164061289', '381212', '50947470', '164061289', '52330950', '6078057', '5171189', '41728653', 
'40472844', '179148988', '6508710', '6004034', '105505232', '4271930', '6176854', '178875936', '442578', '40889768', '105505232', '6432844', '40575741', '178875936', '1602918', '4792019', '157744453', '41396510', '2559464', '184747421', '6245177', '5085191', '157375549', '2573040', '2554983', '185120687', '5332103', '40038566', '179475024', '6023029', '480676', '185138822', '1596186', '3246313', '182352897', '428148', '86773', '160010125', '5964850', '6923250', '175494013', '1587453', '4358096', '159235447', '40030326', '6673763', '94514398', '50985240', '6471008', '93730128', '4933157', '6805656', '181445849', '1603991', '393278', '152751398', '4933950', '413646', '471521', '5484695', '40641744', '158390675', '40953199', '1517914', '182009091', '41687111', '6286851', '40583282', '392322', '1603527', '40583282', '6385329', '6788174', '179759213', '51142829', '1599697', '184039570', '41851753', '4885544', '184940275', '2433952', '1601702', '157228652', '4566735', '381150', '186438970', '2502218', '5407454', '165504380', '394362', '434254', '5422538', '400710', '5901906', '165956948', '42034822', '5359690', '177704539', '6166011', '42840877', '4014539', '4020573', '5112931', '175494616', '2409981', '1602899', '394974', '4925454', '6327569', '160325806', '5593588', '1598520', '160742948', '5713014', '4422147', '160742948', '396256', '1603594', '52788970', '404627', '40015701', '184507254', '6260324', '440723', '165617320', '5425193', '382644', '98356856', '4683342', '4167238', '177547750', '42126967', '5619662', '179728975', '41231334', '4391348', '169289247', '41098323', '4203778', '41025644', '42847614', '51176262', '184551849', '1602244', '1583673', '184551849', '1583673', '41190225', '184935813', '5008330', '3245331', '6871974', '5630714', '50740916', '181445849', '5082588', '380650', '79049718', '42616400', '493360', '456940', '4059505', '4894348', '185874332', '6714750', '384041', '177890785', '5668254', '5643438', '427369', '6234044', '42620865', '178142412', 
'4574190', '5731695', '178142412', '1546450', '41161322', '5624590', '2746468', '41443819', '6089922', '2464288', '5634821', '181063104', '6569177', '6217302', '164492779', '6612522', '6489828', '165945429', '6489828', '5415000', '90606952', '2410561', '5579759', '185880323', '393534', '41467372', '168874287', '6333506', '6278926', '109376707', '41752464', '6350296', '182124357', '6855003', '5367163', '182080859', '42950352', '40986310', '184883470', '6282871', '6141176', '182405136', '1602849', '441352', '182075881', '2434344', '41083139', '184203949', '50239972', '5395635', '6756189', '40673361', '42236961', '166789765', '41580094', '419475', '179147394', '2442689', '1601726', '91630294', '41777465', '52552578', '91630294', '6140468', '4653612', '178067203', '6754965', '1593761', '181420293', '1600092', '6664726', '180582546', '40131106', '5999190', '179379283', '51426143', '52994678', '179978374', '42268110', '391639', '184421889', '2748557', '3262434', '182362067', '50724006', '40643472', '182539925', '42953810', '4559163', '185303320', '1551279', '6280767', '156618249', '52014123', '6532303', '1579137', '42880684', '4952193', '183981936', '4708795', '40043172', '185097552', '398743', '4999958', '182054300', '4098956', '5388194', '41801478', '6473694', '2722501', '4794557', '41024883', '498903', '4418063', '67018417', '4310012', '400485', '4085544', '5329163', '41552017', '40684781', '4447651', '381896', '6302800', '40298722', '6510031', '6342374', '5037447', '5688980', '5659118', '1601795', '5790338', '6573363', '499391', '1599116', '427788', '5041699', '1595335', '51530812', '1601525', '4468019', '52257544', '41514323', '5157670', '4640188', '4081570', '1602234', '2422442', '97452502', '50119725', '5063379', '4179791', '2427707', '1601909', '150569553', '52744118', '4495127', '51170363', '6821689', '2463383', '4677820', '440252', '4988187', '5110178', '6031336', '6629244', '6010631', '50405109', '52716145', '3304802', '5394337', '41013147', '40100499', 
'6537613', '4532157', '413609', '4177239', '6568103', '415306', '133527106', '5701239', '5138260', '6297691', '40936169', '380955', '42626868', '4290868', '1604042', '4860977', '51748396', '42303', '6384104', '42191206', '41953861', '384060', '4847947', '40263521', '467071', '1600177', '2492015', '2548926', '150044464', '447877', '6043616', '41340175', '414834', '40184188', '5396841', '40644437', '52163162', '40018031', '6969562', '50576755', '42984568', '416442', '114894811', '93538066', '6822127', '2693925', '66836', '5768112', '50794717', '4313212', '6117865', '4132537', '96220733', '2735261', '2568828', '5419677', '4826730', '1602541', '5575795', '403210', '3311762', '454272', '5707736', '42157789', '1560968', '5488712', '41813171', '4840937', '2572772', '5510351', '65783958', '41648539', '1493361', '484619', '428979', '5296487', '394685', '40406492', '401274', '1493375', '2561971', '41248498', '42850157', '41208296', '380786', '4238818', '1493270', '42948501', '409450', '5400003', '5850234', '4568051', '4411136', '41320801', '2379108', '70363939', '5468016', '5240387', '5351247', '6415581', '41593655', '63903865', '6194898', '40483', '2519033', '6119459', '380847', '4397347', '6603552', '6158206', '4600991', '51805866', '4678131', '5880477', '41713662', '380865', '1584738', '6438438', '4088567', '6334713', '41388925', '442056', '6931213', '449458', '1457254', '2573137', '1467833', '1602106', '392597', '52323161', '381185', '50732671', '433448', '5712615', '67154337', '383126', '1597731', '420020', '1125173', '66675238', '6836298', '380949', '40160872', '6262482', '6080194', '2433767', '4770747', '405752', '4826390', '64594426', '6868759', '1601345', '497066', '1583783', '5891292', '4644728', '42417822', '5968605', '6641244', '40139259', '1046917', '491976', '6213251', '440630', '1597611', '2411509', '491569', '52406808', '41762715', '446128', '2579469', '5985688', '4733289', '50062847', '41869886', '6569086', '4707970', '1599694', '41809802', '4905659', 
'92515', '6073858', '381508', '52583242', '52755093', '440433', '52714931', '60925663', '1601606', '414210', '400041', '6390391', '1576383', '474113', '42598882', '6846489', '442506', '5392878', '6369624', '42645591', '42146686', '2570224', '6615976', '381193', '6131124', '431128', '4870497', '6281279', '62249947', '5654350', '2574722', '3317019', '6088993', '4033612', '61819803', '52193322', '41499475', '452540', '41930893', '41378282', '383118', '4401528', '1602738', '40928677', '41783095', '6161536', '6774991', '5124448', '1603944', '484791', '2757970', '1557292', '5734596', '383317', '480766', '6309304', '4382262', '3308721', '2456004', '6997121', '6764886', '1583680', '4805726', '31018', '490963', '40992086', '2465399', '1544326', '484450', '40800272', '423616', '66239240', '58912', '450283', '4684057', '4122235', '40652826', '2568943', '1602648', '6516279', '41770792', '1549248', '5305035', '1560989', '5122483', '6747422', '4031921', '1596065', '480407', '6181799', '41295428', '5907425', '50013110', '61090505', '107603611', '4572838', '5381392', '4267990', '40212285', '42286437', '479765', '432451', '5122483', '6747422', '4031921', '1596065', '480407', '6181799', '41295428', '5907425', '50013110', '61090505', '107603611', '4572838', '5381392', '4267990', '40212285', '424826', '394607', '493330', '4455182', '4428619', '490774', '5773118', '4401950', '41262861', '99337377', '4061371', '381575', '40612670', '40096228', '41565040', '42467758', '50941669', '41679585', '4019851', '488188', '1046931', '5130573', '396518', '381229', '6405449', '446016', '6826791', '40571006', '1509339', '42630094', '42846624', '40192345', '42483502', '65935627', '5163520', '5693114', '2415296', '381726', '66686', '40121157', '6650010', '2432998', '52231948', '70340', '1147741', '492871', '2485163', '50265678', '40626220', '61998', '6383309', '1597883', '5455752', '1601929', '40127797', '468921', '40455965', '50261231', '6441003', '40186747', '398399', '41977511', '430070', '400355', 
'52231948', '70340', '1147741', '492871', '2485163', '50265678', '40626220', '61998', '6383309', '1597883', '5455752', '1601929', '40127797', '468921', '40455965', '41727574', '3290010', '40386534', '499833', '42928275', '2677193', '1508156', '384014', '1549381', '6537884', '6402821', '1506200', '4010867', '493405', '5798596', '40810149', '51618621', '42308393', '6129369', '50327077', '1600205', '40831450', '4609557', '6010688', '50531223', '42143903', '4537122', '421957', '400295', '1580185', '412628', '52490365', '4472776', '41858633', '50208048', '481119', '52364847', '40217449', '49485', '6809822', '43339', '42737397', '41871789', '41386266', '5792231', '1587551', '4766245', '416200', '66714057', '4766970', '3280872', '50061785', '5719423', '383043', '5626602', '5839875', '41978807', '1475500', '4553359', '392494', '5780963', '5138841', '6686653', '61368502', '4088052', '27383', '49485', '6809822', '43339', '42737397', '41871789', '41386266', '5792231', '1587551', '4766245', '416200', '66714057', '4766970', '3280872', '50061785', '5719423', '3293508', '41226717', '393889', '5649351', '4354851', '4860752', '59005', '4982848', '40671138', '381703', '42926181', '78956', '382588', '490884', '3302843', '40672512', '415557']
    # For a quick smoke test, shrink url_list to a single id such as '170616710'.

    # Row id passed to save_data; it advances once per attempted merchant,
    # whether or not the attempt succeeded.
    i = 1
    # Ids already processed; the list above contains repeats that would
    # otherwise be fetched and inserted a second time.
    seen = set()

    # Open the database connection. Keyword arguments are required by
    # PyMySQL 1.0+ and are also accepted by older releases.
    # NOTE(review): credentials are hard-coded in source — move them to
    # configuration or environment variables.
    db = pymysql.connect(host="localhost", user="root",
                         password="ZHao979736", database="dj_shop")
    try:
        cursor = db.cursor()
        for data in url_list:
            if data in seen:
                continue
            seen.add(data)

            url = "https://www.meituan.com/meishi/" + data + "/"
            time.sleep(0.1)
            try:
                # Without a timeout one stalled connection blocks the crawl.
                con = requests.get(url=url, headers=headers, timeout=10).content.decode('utf-8')
                str_data = re.findall('"detailInfo":(.+?),"photos"', con)[0]
                json_data = json.loads(str_data)
                name = json_data['name']
                address = json_data['address']
                type_name = re.findall('meishi/"},{"title":"(.+?)"', con)[0]
                # Separate the mobile number from the landline number.
                phone, tel = _split_phone(json_data['phone'])

                save_data(db, cursor, i, name, type_name, address, phone, tel)
                time.sleep(0.1)
            except Exception as exc:
                # Best-effort crawl: report the failure and move on rather
                # than hiding it or aborting the remaining merchants.
                print('skipped %s: %r' % (url, exc))
            print('程序执行%d次' % i)
            i = i + 1
    finally:
        # Close the connection even if the crawl is interrupted.
        db.close()
        print('关闭了')

def save_data(db,cursor,i, name, type_name, address, phone, tel):
    '''
    Insert one merchant row into ``tb_dianping`` and commit it.

    The values are passed to the driver as query parameters rather than
    being formatted into the SQL text, so quotes inside a merchant name or
    address cannot break (or inject into) the statement, and ``None`` is
    stored as SQL NULL instead of the literal text ``'None'``.

    On any failure the transaction is rolled back and the error is printed;
    the exception is not propagated, so the caller can continue with the
    next merchant.

    :param db: open database connection providing commit() / rollback()
    :param cursor: cursor created from ``db``
    :param i: value for the first column of the row
    :param name: merchant name
    :param type_name: merchant category
    :param address: merchant address
    :param phone: mobile number, or None
    :param tel: landline number, or None
    :return: None
    '''
    # One %s placeholder per column; the driver escapes and quotes each value.
    sql = "INSERT INTO `tb_dianping` VALUES (%s, %s, %s, %s, %s, %s);"
    try:
        cursor.execute(sql, (i, name, type_name, address, phone, tel))
        db.commit()
        print('写入成功')
    except Exception as exc:
        # Undo the partial transaction and say why the row was not written.
        db.rollback()
        print('write failed for row %s: %r' % (i, exc))


if __name__ == '__main__':
    # Step 1 (disabled): collect the merchant id list.
    # get_dateil_url()
    detail_data()  # Step 2: fetch each merchant's details and store them.
  4. git 地址:
    https://gitee.com/za_tan/codes/f79k4dujg5exywp3nctsz68

内含上海/杭州/南京的商家信息 3.4 万条。

你可能感兴趣的:(网络爬虫)