Python爬取(仅供学习)

首先看效果图。目前可以爬取一页 20 条数据,并且可以根据特定的关键字进行搜索,如下图所示:

python爬取(仅学习)_第1张图片

接下来是代码实现:

import requests
from lxml import html
import json
url='https://i.meituan.com/s/a'

headers={
"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding":"gzip, deflate, br",
"Accept-Language":"zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
"Connection":"keep-alive",
"Cookie":"__mta=107221821.1693728493255.1693743286835.1693743294640.16; _lxsdk_cuid=18a59ed9572c8-0c76d8af0d04-26031f51-1fa400-18a59ed9572c8; iuuid=2372D98DE5B706829EDD20A81DABB9F703852FA445F300D5B57D3E0F51085247; ci=1; cityname=%E5%8C%97%E4%BA%AC; webp=1; pinyin=beijing; WEBDFPID=7u50278909535wy4yv54253yz145381z40u95y8297958u1u6v6w3-2009086034186-1693726033680AQOIUAEfd79fef3d01d5e9aadc18ccd4d0c95072259; _lxsdk=2372D98DE5B706829EDD20A81DABB9F703852FA445F300D5B57D3E0F51085247; uuid=e3c3261c4d11488388a0.1693726046.1.0.0; wm_order_channel=default; utm_source=; _ga=GA1.1.391105884.1693726503; mtcdn=K; mt_c_token=AgG0I9bbdMqeuwXBu1jZujVkSHkG6VeK78Pj1yi64t_BE24Bn8yrOyF4Bn_ri8GQrh4ghWG-PkOH-wAAAACPGgAADPhz8Qmnx8ETbtU8dnfHQMx4BhwYc1Ma9Jtc7zm7O-7vZ_Ki3H2UKFhy9vow97yV; _ga_95GX0SH5GM=GS1.1.16937265031.1693728316.0.0.0; isid=AgG0I9bbdMqeuwXBu1jZujVkSHkG6VeK78Pj1yi64t_BE24Bn8yrOyF4Bn_ri8GQrh4ghWG-PkOH-wAAAACPGgAADPhz8Qmnx8ETbtU8dnfHQMx4BhwYc1Ma9Jtc7zm7O-7vZ_Ki3H2UKFhy9vow97yV; logintype=normal; _lx_utm=utm_source%3Dbing%26utm_medium%3Dorganic; __utmz=74597006.1693728461.2.2.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _hc.v=ddd3117a-4df8-c480-e7f1-5ab835097f3f.1693728506; p_token=AgG0I9bbdMqeuwXBu1jZujVkSHkG6VeK78Pj1yi64t_BE24Bn8yrOyF4Bn_ri8GQrh4ghWG-PkOH-wAAAACPGgAADPhz8Qmnx8ETbtU8dnfHQMx4BhwYc1Ma9Jtc7zm7O-7vZ_Ki3H2UKFhy9vow97yV; ci3=1; latlng=40.309937%2C116.63559; JSESSIONID=node05w62e2pidln928zkqwwab1n49429605.node0; IJSESSIONID=node05w62e2pidln928zkqwwab1n49429605; oops=AgG0I9bbdMqeuwXBu1jZujVkSHkG6VeK78Pj1yi64t_BE24Bn8yrOyF4Bn_ri8GQrh4ghWG-PkOH-wAAAACPGgAADPhz8Qmnx8ETbtU8dnfHQMx4BhwYc1Ma9Jtc7zm7O-7vZ_Ki3H2UKFhy9vow97yV; u=1825036795; idau=1; __utma=74597006.1419997370.1693726028.1693732847.1693743287.4; __utmc=74597006; logan_session_token=tllimadewmhj46tfe4zl; _lxsdk_s=18a5af4f676-045-31-02%7C%7C113; __utmb=74597006.2.10.1693743287; 
i_extend=C_b0E074737795478104405471777368049152848393_e3346570330234811952_a%e5%9b%a2%e5%9c%86%e5%81%87%e6%97%a5%e9%a5%ad%e5%ba%97H__a",
"Host":"i.meituan.com",
# "Referer":"https://meishi.meituan.com/",
"Sec-Ch-Ua":'"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
"Sec-Ch-Ua-Mobile":"?0",
"Sec-Ch-Ua-Platform":"Windows",
"Sec-Fetch-Dest":"document",
"Sec-Fetch-Mode":"navigate",
"Sec-Fetch-Site":"none",
"Sec-Fetch-User":"?1",
"Upgrade-Insecure-Requests":"1",
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
}

# Ask for the search keyword; it is sent to the server as the ``w`` query
# parameter of the search URL.
w = input("请输入你要查询的关键字:")

data = {"w": w}
print(data)

# Fetch the search result page. Without a timeout requests may wait forever
# on a stalled connection, so bound the wait explicitly.
response = requests.get(url=url, headers=headers, params=data, timeout=10)
# Fail here with a clear HTTP error instead of parsing an error page as if
# it were a result list.
response.raise_for_status()
response.encoding = 'utf-8'

# Parse the HTML with lxml so the result links can be selected via XPath.
tree = html.fromstring(response.text)

# Headers for the shop detail pages (note the different Host), built once
# instead of being re-created on every loop iteration.
# NOTE(review): the Cookie value looks like a captured logged-in browser
# session (token / isid entries) -- presumably it expires; confirm before reuse.
DETAIL_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
    "Cookie": "__mta=174305565.1693750373607.1693750476352.1693750489385.3; client-id=1c77abaf-06a4-4db9-b005-cba2226c85c8; iuuid=40EDD2F761C8DF5F307078A2F308376CF8DC5F71F79DEC8D9B7A7AFE08C40A1F; _lxsdk_cuid=18a5a1cacdcc8-0bf8ef3cdff049-26031f51-1fa400-18a5a1cacdcc8; _lxsdk=40EDD2F761C8DF5F307078A2F308376CF8DC5F71F79DEC8D9B7A7AFE08C40A1F; WEBDFPID=uu9001z1zx415z6w0032787w891u122v81z4z8xzu6897958749u6yx9-2009088943380-1693728940544MSAQKKCfd79fe1d5e9aadc18ccd4d0c95074000; uuid=df08b000fe14442aadad.1693728945.1.0.0; ci=1; cityname=%E5%8C%97%E4%BA%AC; token=AgHyI6cURXbNzV1XhMvgjgGI_fiUblqLlyI_dOGBrvaje7CuqU-Xn3jCfsJsDWhX0RQx96kUltRvAACPGgAAGr1EWneA3DZ-mCl89c4GrVOGqki1yUtG2Jj8JJl6wX1bhWtpqoPKgu_u0WuLAgzm; mt_c_token=AgHyI6cURXbNzV1XhMvgjgGI_fiUblqLlyI_dOGBrvaje7CuqU-Xn3jCfsJsDWhX0RQx96kUltRvjgAAAACPGgAAGr1EWneA3DZ-mCl89c4GrVOGqki1yUtG2Jj8JJl6wX1bhWtpqoPKgu_u0WuLAgzm; userId=1825036795; isid=AgHyI6cURXbNzV1XhMvgjgGI_fiUblqLlyI_dOGBrvaje7CuqU-Xn3jCfsJsDWhX0RQx96kUltRvjgAAAACPGgAAGr1EWneA3DZ-mCl89c4GrVOGqki1yUtG2Jj8JJl6wX1bhWtpqoPKgu_u0WuLAgzm; logintype=normal; webp=1; i_extend=H__a100005__b1; __utma=74597006.907694184.1693744870.1693744870.1693744870.1; __utmz=74597006.1693744870.1.1.utmcsr=passport.meituan.com|utmccn=(referral)|utmcmd=referral|utmcct=/; latlng=40.334445,116.627097,1693744871370; p_token=AgHyI6cURXbNzV1XhMvgjgGI_fiUblqLlyI_dOGBrvaje7CuqU-Xn3jCfsJsDWhX0RQx96kUltRvjgAAAACPGgAAGr1EWneA3DZ-mCl89c4GrVOGqki1yUtG2Jj8JJl6wX1bhWtpqoPKgu_u0WuLAgzm; _hc.v=07bc6f3d-35b1-d6e3-24d1-826b22665d60.1693750372; _lx_utm=utm_source%3Dpassport.meituan.com%26utm_medium%3Dreferral%26utm_content%3D%252F; logan_session_token=o34z7k2v61npfzvx79ih; _lxsdk_s=18a5b554bff-f02-4b1-d16%7C%7C6",
    "Host": "meishi.meituan.com",
    "Sec-Ch-Ua": '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": "Windows",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
}

# Number of result positions to probe on the search page.
MAX_RESULTS = 20

# Markers that delimit the shop JSON object inside the page's inline script.
POI_START_MARKER = '"poiInfo":'
POI_END_MARKER = ',"crawlerMeta":'


def parse_poi_info(script_text):
    """Extract the ``poiInfo`` JSON object embedded in an inline script.

    The object is taken as the text between ``"poiInfo":`` and
    ``,"crawlerMeta":``.

    Args:
        script_text: Text content of a ``<script>`` element.

    Returns:
        The decoded dict, or ``None`` when the markers are missing or the
        enclosed text is not a JSON object.
    """
    try:
        payload = script_text.split(POI_START_MARKER)[1].split(POI_END_MARKER)[0]
        info = json.loads(payload)
    except (IndexError, ValueError):
        # IndexError: start marker absent; ValueError: payload is not JSON.
        return None
    return info if isinstance(info, dict) else None


def fetch_poi_info(detail_url):
    """Download one shop detail page and return its ``poiInfo`` dict.

    Args:
        detail_url: URL of the detail page, as found on the search page.

    Returns:
        The decoded ``poiInfo`` dict, or ``None`` if the request failed or
        no inline script on the page contained a parsable ``poiInfo`` object.
    """
    try:
        # Bound the wait so one stalled page cannot hang the whole run.
        response = requests.get(url=detail_url, headers=DETAIL_HEADERS, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print("请求失败:", detail_url, exc)
        return None
    response.encoding = 'utf-8'

    detail_tree = html.fromstring(response.text)
    # Scan every inline script under <body> rather than relying on a fixed
    # position, so added or removed scripts do not break the lookup.
    for script_text in detail_tree.xpath('/html/body/script/text()'):
        info = parse_poi_info(script_text)
        if info is not None:
            return info
    return None


def crawl_details(result_tree, max_results=MAX_RESULTS):
    """Print the name and phone number of each shop linked from a search page.

    Args:
        result_tree: Parsed search result page (an lxml element).
        max_results: How many result positions (1-based) to probe.
    """
    for position in range(1, max_results + 1):
        hrefs = result_tree.xpath(
            f'/html/body/dl/dd[1]/dl[{position}]/dd[1]/a/@href'
        )
        if not hrefs:
            # Fewer results than expected, or an entry without a link.
            continue
        detail_url = hrefs[0]
        print(detail_url)

        info = fetch_poi_info(detail_url)
        if info is None:
            print("未能解析店铺信息:", detail_url)
            continue
        print("店铺名字:", info.get("name"))
        print("手机号:", info.get("phone"))


# Guarded so the helpers above can be imported without starting the crawl;
# running the file as a script behaves as before.
if __name__ == "__main__":
    crawl_details(tree)

你可能感兴趣的:(python学习之路,学习,python,爬虫,网络爬虫)