A Comprehensive Collection of Python Web-Scraping Practice Projects

Table of Contents

    • 1. Scraping AMap (Gaode Maps) Data
    • 2. Youdao Translate: JS Encryption
    • 3. Langlang Yujia (langlang2017.com)
    • 4. Maoyan Movies
    • 5. Scraping Fangwang (gz.ihk.cn) Data
    • 6. ChinaAMC Funds
    • 7. Qiushibaike
    • 8. Scraping Baixing Data
    • 9. Scraping Fang.com (房天下) Data
    • 10. Scraping Douban Book Ranking Data
    • Taobao API
    • 11. Scraping NetEase Cloud Music Data
    • 12. Scraping Lianjia Rental Listings

1. Scraping AMap (Gaode Maps) Data

import requests

'''
Beijing weather endpoint: https://www.amap.com/service/weather?adcode=110000
Tianjin weather endpoint: https://www.amap.com/service/weather?adcode=120000

adcode list endpoint: https://www.amap.com/service/cityList?version=201951410
'''
class Gao(object):

    def __init__(self):
        self.run()

    def run(self):

        # Declare a base_url
        base_url = "https://www.amap.com/service/weather?adcode="

        # Get the adcode of every city
        adcode_list = self.get_adcode()
        # print(adcode_list)

        # Loop over the adcodes and fetch each city's weather
        for c, adcode_dict in enumerate(adcode_list, 1):
            # Get the adcode, used to build the full url
            adcode = adcode_dict["adcode"]

            # Build the full url
            full_url = base_url + adcode
            # print(full_url)

            # Send the request and get the weather JSON data
            response = requests.get(full_url)
            json_data = response.json()
            # print(json_data)
            # First approach: check the message field returned by the API
            # msg = json_data.get("data").get("message")
            # if msg == "Successful.":
            #     # Get the weather description
            #     weather_name = json_data.get("data").get("data")[0].get("live").get("weather_name")
            #     # print(weather_name)
            #
            #     # Add the weather description to adcode_dict
            #     adcode_dict["weather_name"] = weather_name
            #     print(c, adcode_dict)
            # else:
            #     print(msg)

            # Second approach: just try the extraction and handle the exception

            try:
                # Get the weather description
                weather_name = json_data.get("data").get("data")[0].get("live").get("weather_name")
                # print(weather_name)

                # Add the weather description to adcode_dict
                adcode_dict["weather_name"] = weather_name
                print(c, adcode_dict)
            except Exception as e:
                print(e)

    # Get the adcode of every city
    def get_adcode(self):
        # The adcode endpoint
        base_url = "https://www.amap.com/service/cityList?version=201951410"
        # Send the request
        response = requests.get(base_url)
        # print(response.json())
        # print(response.text)

        # Get the JSON data
        json_data = response.json()

        # Get the city listing, grouped by first letter
        city_by_letter = json_data.get("data").get("cityByLetter")
        # print(city_by_letter)

        # Declare a list to hold every city dict
        city_list = []

        # First approach: loop over the values and append each city dict one by one
        # for city_list1 in city_by_letter.values():
        #     # print(city_list1)
        #     for city_dict in city_list1:
        #         print(self.count, city_dict)
        #         city_list.append(city_dict)
        #         self.count += 1

        # Second approach: concatenate each letter's list onto city_list
        for city_list1 in city_by_letter.values():
            city_list += city_list1
        print(city_list)
        # print(len(city_list))
        # Return the list holding every city dict
        return city_list


if __name__ == '__main__':
    Gao()

'''
Summary:
Lists can be concatenated with +=, e.g. city_list += city_list1.
To check whether a piece of data was fetched successfully, look at the message the API
returns, and only extract the detailed fields after a successful response; otherwise the
extraction can raise and break the run, so wrap it in exception handling.
'''
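
For reference, a minimal sketch of querying the weather for a single adcode and checking the API's
message field before extracting anything (it assumes the endpoints above still return the same JSON
shape; adcode 110000 is Beijing, as noted in the docstring):

import requests

# Minimal sketch, assuming a response shaped like
# {"data": {"message": "Successful.", "data": [{"live": {"weather_name": ...}}]}}
def get_weather(adcode="110000"):
    url = "https://www.amap.com/service/weather?adcode=" + adcode
    json_data = requests.get(url).json()
    if json_data.get("data", {}).get("message") == "Successful.":
        return json_data["data"]["data"][0]["live"]["weather_name"]
    return None

# print(get_weather("110000"))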

2. Youdao Translate: JS Encryption

import requests
import time
import random
import hashlib
def md5(value):
    # Create an MD5 object
    md5_obj = hashlib.md5()
    # Feed in the UTF-8 encoded string
    md5_obj.update(bytes(value, encoding="utf-8"))
    # Take the 32-character hex digest
    sign = md5_obj.hexdigest()
    return sign


def youdao(i):
    # Build the salt: millisecond timestamp plus one random digit
    salt = str(int(time.time() * 1000)) + str(random.randint(0, 9))
    # print(salt)
    # Build the sign
    sign1 = "fanyideskweb" + i + salt + "@6f#X3=cCuncYssPsuRUE"
    sign = md5(sign1)
    # The POST form data
    data = {
        "i": i,
        # "from": "AUTO",
        # "to": "AUTO",
        # "smartresult": "dict",
        "client": "fanyideskweb",
        "salt": salt,
        "sign": sign,
        # "ts": "1558514897639",
        # "bv": "cf156b581152bd0b259b90070b1120e6",
        # "doctype": "json",
        # "version": "2.1",
        "keyfrom": "fanyi.web",
        # "action": "FY_BY_REALTlME"
    }

    # 加上请求头 浏览器信息
    headers = {
        # "Accept": "application/json, text/javascript, */*; q=0.01",
        # "Accept-Encoding": "gzip, deflate",
        # "Accept-Language": "zh-CN,zh;q=0.9",
        # "Connection": "keep-alive",
        # "Content-Length": "238",
        # "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Cookie": "[email protected]; OUTFOX_SEARCH_USER_ID_NCOO=1844201936.6123636; _ga=GA1.2.1939912746.1552966532; JSESSIONID=aaaB9UfpkFL02gnEynoRw; ___rl__test__cookies=1558514897636",
        # "Host": "fanyi.youdao.com",
        # "Origin": "http://fanyi.youdao.com",
        "Referer": "http://fanyi.youdao.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36",
        # "X-Requested-With": "XMLHttpRequest"
    }

    # 定义起始url
    base_url = "http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule"

    # 发送请求
    response = requests.post(base_url, data=data, headers=headers)
    # 获取response里面的json数据
    json_data = response.json()
    print(json_data)
    print(type(json_data))


if __name__ == '__main__':
    i = input("请输入需要翻译的内容:")
    # i = "banana"
    youdao(i)

"""
Problem 1:
Sending the request with only the data payload returns no translation, just {'errorCode': 50}.
Fix: add browser request headers and send the request again.
Problem 2: still {'errorCode': 50}.
Fix: copy all of the request headers from the browser into headers.
"""

# i: banana
# client: fanyideskweb
# salt: 15585168560444
# sign: da50e3193cda496e1455ff28c1bb21b1
# keyfrom: fanyi.web
#
# i: apple
# "client": "fanyideskweb",
# "salt": "15585148976393",
# "sign": "147950af9758d1e79aeaacd4ff27d14d",
# "keyfrom": "fanyi.web",
#
#
# salt: check whether it has to be derived/encrypted
# sign: likewise, check whether it has to be derived/encrypted

# The first thing to work out is how salt and sign are generated
'''
salt = o.salt = i = r + parseInt(10 * Math.random(), 10)
"" + (new Date).getTime() + parseInt(10 * Math.random(), 10)  js
= "" + int(time.time() * 1000) + random.randint(0, 9)

o = r.generateSaltSign(t) = r(t)
r.generateSaltSign(t) = t.generateSaltSign(t) = r(t)
{
    ts: r,
    bv: t,
    salt: i,
    sign: n.md5("fanyideskweb" + e + i + "@6f#X3=cCuncYssPsuRUE")
}


e = t = "apple"   the text to be translated

var r = function(e)
    {
            var
        t = n.md5(navigator.appVersion),
            r = "" + (new Date).getTime(),
                i = r + parseInt(10 * Math.random(), 10);
        return {
            ts: r,
            bv: t,
            salt: i,
            sign: n.md5("fanyideskweb" + e + i + "@6f#X3=cCuncYssPsuRUE")
    }
};


Length comparison (salt generated by the Python code vs. the one captured from the JS):
15585816225096   python
15585148976393   js
15585822104216


sign = o.sign
= n.md5("fanyideskweb" + e + i + "@6f#X3=cCuncYssPsuRUE")
= md5("fanyideskweb" + "apple" + salt + "@6f#X3=cCuncYssPsuRUE")
'''
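
Putting the analysis above together, the browser-side salt/sign generation can be reproduced in
Python roughly as follows (a sketch based on the JS shown above; the "@6f#X3=cCuncYssPsuRUE" suffix
is whatever key the site ships at the time and may change):

import time
import random
import hashlib

def make_salt_and_sign(text):
    # salt = "" + (new Date).getTime() + parseInt(10 * Math.random(), 10)
    salt = str(int(time.time() * 1000)) + str(random.randint(0, 9))
    # sign = md5("fanyideskweb" + e + i + "@6f#X3=cCuncYssPsuRUE")
    raw = "fanyideskweb" + text + salt + "@6f#X3=cCuncYssPsuRUE"
    sign = hashlib.md5(raw.encode("utf-8")).hexdigest()
    return salt, sign

# salt, sign = make_salt_and_sign("apple")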

3. Langlang Yujia (langlang2017.com)

import re
import requests


class Lang:

    def __init__(self):
        self.run()

    def run(self):
        # 获取页面信息
        base_url = "http://www.langlang2017.com/"
        response = requests.get(base_url)
        html = response.content.decode("utf-8")
        print(html)
        self.get_data(html)

    def get_data(self, html):
        # Narrow the scope first.
        # NOTE: the original pattern was lost when this post was published (the HTML
        # tags inside it were stripped); fill in the pattern for the image block here.
        pattern1 = re.compile('')
        result1 = pattern1.search(html).group()
        # print(result1)

        # Get the alt texts. findall with a capture group returns only what is inside
        # the parentheses, not the rest of the match (see the short demo after this section).
        alt_list = re.findall('alt="(.*?)"', result1)
        # print(alt_list)

        # Get the src attributes
        src_list1 = re.findall('src="(.*?)"', result1)
        # print(src_list1)
        # Build the full image URLs
        src_list = []
        for s in src_list1:
            src = "http://www.langlang2017.com/" + s
            # print(src)
            src_list.append(src)
        # print(src_list)

        # Hint: the fields below are extracted from the full html, not from result1.
        # Phone number (the tags around this pattern were stripped when the post was published)
        phone1 = re.findall('联系电话:(\d{11})', html)[0]
        # phone1 = re.search('1\d{10}', html)
        # group(1) selects the contents of the first capture group
        # phone = phone1.group()
        print(phone1)

        # All http links (there are two)
        http_list = re.findall('"(http.*?)"', html)
        # print(http_list)

        # Address (the closing tag of the original pattern was stripped; '<' is used
        # here only as a stand-in terminator)
        address = re.search('地址:(.*?)<', html)
        address = address.group(1)

        lang_dict = {
            "alt_list": alt_list,
            "src_list": src_list,
            # "phone": phone,
            "http_list": http_list,
            "address": address
        }
        print(lang_dict)
        import json
        print(json.dumps(lang_dict))


if __name__ == '__main__':
    Lang()
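
A quick illustration of the point in the comment above: when the pattern contains a capture group,
re.findall returns only the group's contents, not the whole match (a generic example, not tied to
this site's markup):

import re

sample = '<img src="img/1.jpg" alt="dock at dawn">'
print(re.findall(r'alt="(.*?)"', sample))   # ['dock at dawn']          -> only the group
print(re.findall(r'alt=".*?"', sample))     # ['alt="dock at dawn"']    -> the whole match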

4. Maoyan Movies

import requests
import re


class Mao:
    def __init__(self):
        '''
        __init__ only sets up some initial state; it does not call the other methods.
        It runs when the class is instantiated.
        '''
        self.count = 1
        self.spider_name = "万能爬虫"
        # self.run()

    def __call__(self, *args, **kwargs):
        '''
        Runs when an instance of this class is called like a function, e.g. mao().
        '''
        self.run()

    def run(self):
        # 获取猫眼的html信息 字符串信息
        base_url = "https://maoyan.com/board"
        response = requests.get(base_url)
        html = response.text
        # print(html)
        self.get_data(html)

    def get_data(self, html):
        # Narrow the scope: grab each movie's <dd> block.
        # A plain . does not match newlines; either use [\s\S], or pass re.S so that .
        # also matches \n (see the short demo after this section).
        dd_list = re.findall('<dd>.*?</dd>', html, re.S)
        # print(dd_list)
        # print(dd_list[0])
        # print(len(dd_list))
        # import json
        # print(json.dumps(dd_list))

        # Loop over the <dd> blocks and pull out each movie's details.
        # NOTE: the HTML-tag parts of several patterns below were stripped when this
        # post was published; the surviving fragments are kept as-is.
        for dd in dd_list:
            # print(dd)
            # Ranking
            rank = re.findall('(\d+)', dd)[0]
            # print(rank)

            # Movie title
            name = re.findall('title="(.*?)" class', dd)[0]
            # print(name)

            # Cast
            actor = re.findall('([\d\D]*?)', dd)[0]
            if "主演" not in actor:
                actor = ""
            else:
                # Strip leading and trailing whitespace
                actor = actor.strip()
            # print(actor)

            # Release date
            publish_date = re.findall('上映时间:(.*?)', dd)[0]
            # print(publish_date)

            # Score: the integer part and the decimal digit sit in separate tags
            score_match = re.search('(.*?)(\d)', dd)
            # print(score_match.group(1))
            # print(score_match.group(2))
            score = score_match.group(1) + score_match.group(2)
            # print(score)

            # Poster image.
            # What the browser shows and what the code receives can differ; write the
            # regex against the string the code actually receives. Prefer the large image.
            pic = re.findall('data-src="(.*?)@160w_220h_1e_1c"', dd)[0]
            # print(pic)

            # Collect the movie info into a dict
            movie_dict = {
                "rank": rank,
                "name": name,
                "actor": actor,
                "publish_date": publish_date,
                "score": score,
                "pic": pic
            }
            print(movie_dict)


if __name__ == '__main__':
    mao = Mao()
    mao()
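
The re.S point from the comment above, as a tiny standalone example: without re.S the dot does not
match newlines, so a multi-line block is only matched with re.S or with a [\s\S] character class.

import re

text = "<dd>\nline one\nline two\n</dd>"
print(re.findall(r'<dd>.*?</dd>', text))         # []          -> . does not match \n
print(re.findall(r'<dd>.*?</dd>', text, re.S))   # whole block -> re.S makes . match \n
print(re.findall(r'<dd>[\s\S]*?</dd>', text))    # whole block -> [\s\S] matches anything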

5. Scraping Fangwang (gz.ihk.cn) Data

import re
import requests

class Fang:

    def __init__(self):
        self.count = 1

    def __call__(self, *args, **kwargs):
        self.get_max_page()

    # Get the maximum page number
    def get_max_page(self):
        base_url = "http://gz.ihk.cn/myxf/houselist/?mark=gzxf089"
        html = self.get_html(base_url)

        # NOTE: the pattern below, plus the code that turned max_page_list into a page
        # loop and produced div_list, was lost when this post was published (the HTML
        # tags inside the patterns were stripped). See the sketch after this section
        # for one way the missing paging logic probably looked.
        max_page_list = re.findall('', html)
        # print(div_list)
        # print(div_list[0])
        # print(len(div_list))
        for div in div_list:
            # Image
            pic = re.findall('data-original="(.*?)"', div)[0]
            # print(pic)

            # Name of the development (tag part of the pattern stripped)
            name = re.findall('(.*?)', div)[0]
            # print(name)

            # Description (tag part of the pattern stripped)
            desc = re.findall('[\s\S]*?([\w\W]*?)', div)[0]
            # print(desc)

            # Main floor plans (tag part of the pattern stripped)
            house_type = re.findall('(.*?)', div)[0]
            # print(house_type)

            # Address (tag part of the pattern stripped)
            address = re.findall('(.*?)', div)[0].strip()
            # print(address)

            # Tags (tag part of the pattern stripped)
            sign = re.findall('(.*?)', div)
            # print(sign)

            # Price (tag part of the pattern stripped)
            price = re.findall('(.*?)', div)[0]
            # print(price)

            fang_dict = {
                "pic": pic,
                "name": name,
                "house_type": house_type,
                "desc": desc,
                "address": address,
                "sign": sign,
                "price": price
            }
            print(self.count, fang_dict)
            self.count += 1

    # Fetch the page for a given url
    def get_html(self, base_url):
        response = requests.get(base_url)
        html = response.text
        # print(html)
        return html


if __name__ == '__main__':
    fang = Fang()
    fang()
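
The published post lost the code between get_max_page and the div loop above. Based on how the other
projects in this list are structured, the missing paging logic probably looked roughly like the sketch
below (the page-URL format and the listing-block pattern are assumptions, not the original values):

    # Sketch only -- fill in the real pagination and listing-block patterns.
    def get_max_page(self):
        base_url = "http://gz.ihk.cn/myxf/houselist/?mark=gzxf089"
        html = self.get_html(base_url)
        max_page_list = re.findall(r'data-page="(\d+)"', html)   # assumed pattern
        max_page = int(max_page_list[-1]) if max_page_list else 1
        self.get_data(max_page)

    def get_data(self, max_page):
        for page in range(1, max_page + 1):
            print("================ page {} ================".format(page))
            page_url = "http://gz.ihk.cn/myxf/houselist/?mark=gzxf089&page={}".format(page)  # assumed URL format
            html = self.get_html(page_url)
            # One match per listing block; the real tag pattern goes here.
            div_list = re.findall(r'<div class="house-item">[\s\S]*?</div>', html)  # assumed pattern
            for div in div_list:
                ...  # extract fields as in the loop above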
    6. ChinaAMC Funds

    import requests
    import re
    
    class Hua:
    
        def __init__(self):
            pass
    
        def __call__(self, *args, **kwargs):
            # self.get_html()
            self.get_data()
    
        def get_html(self):
            base_url = "http://fund.chinaamc.com/portal/cn/include/newproducthome.jsp"
            response = requests.get(base_url)
            html = response.text
            # print(html)
    
            with open("hua.html", "w", encoding="utf-8") as f:
                f.write(html)
    
        def get_data(self):
            with open("hua.html", "r", encoding="utf-8") as f:
                html = f.read()
            # print(html)
            # print(type(html))
            # NOTE: the <table ...> / </table> tag text was stripped from this pattern
            # when the post was published; the reconstruction below is the likely intent.
            table_list = re.findall('<table[\s\S]*?</table>', html)
            # print(table_list)
            # print(table_list[0])
            # print(len(table_list))
            #
            # import json
            # print(json.dumps(table_list))
            for c, table in enumerate(table_list):
                if c == 0:
                    print("==========")
                    # Narrow the scope and get the tr list
                    self.tr_list = self.table_handler(table)
    
                    # print(tr_list)
                    # print(tr_list[0])
                    # print(len(tr_list))
                    for co, tr in enumerate(self.tr_list, 1):
                        # 获取基金名称
                        name_fund_list = self.tr_hander(tr)
                        fund_list = name_fund_list[1]
    
                        fund_dict = {
                            "name": name_fund_list[0],
                            "code": fund_list[1].strip(),
                            "nw_date": fund_list[2],
                            "net_worth": fund_list[3],
                            "cum_worth": fund_list[4],
                            "price_limit": "" if fund_list[5] == "---" else fund_list[5],
                            "set_up_date": fund_list[6],
                            "purchase_status": fund_list[7],
                            "redemption_status": fund_list[8],
                            "cast_surely_status": fund_list[9],
                        }
                        print(co, fund_dict)
    
                        # break
    
                elif c == 1:
                    print("============================")
                    # 缩小范围 获取tr
                    self.tr_list = self.table_handler(table)
                    # print(tr_list)
                    # print(tr_list[0])
                    # print(len(tr_list))
    
                    for co, tr in enumerate(self.tr_list, 1):
                        # 获取基金名称
                        name_fund_list = self.tr_hander(tr)
                        fund_list = name_fund_list[1]
    
                        fund_dict = {
                            "name": name_fund_list[0],
                            "code": fund_list[2].strip(),
                            "nw_date": fund_list[3],
                            "million_return": fund_list[4],
                            "seven_day_annualized_yield": fund_list[5],
                            "aror30": fund_list[6],
                            "the_year_aror": fund_list[7],
                            "set_up_date": fund_list[8],
                            "purchase_status": fund_list[9],
                            "redemption_status": fund_list[10],
                            "cast_surely_status": fund_list[11]
                        }
                        print(co, fund_dict)
    
                elif c == 2:
                    print("===================================")
                    # 缩小范围 获取tr
                    self.tr_list = self.table_handler(table)
                    # print(len(tr_list))
                    # print(tr_list)
                    for co, tr in enumerate(self.tr_list, 1):
                        # 获取基金名称
                        name_fund_list = self.tr_hander(tr)
                        fund_list = name_fund_list[1]
    
                        fund_dict = {
                            "name": name_fund_list[0],
                            "code": fund_list[2].strip(),
                            "nw_date": fund_list[3],
                            "thousands_return": fund_list[4],
                            "seven_day_annualized_yield": fund_list[5],
                            "operation_period": "",
                            "set_up_date": fund_list[6],
                            "purchase_status": fund_list[7],
                            "redemption_status": fund_list[8],
                            "cast_surely_status": fund_list[9]
                        }
                        print(co, fund_dict)
    
                else:
                    print("=============================")
                    # 缩小范围 获取tr
                    self.tr_list = self.table_handler(table)
                    # print(tr_list)
                    # print(len(tr_list))
    
                    for co, tr in enumerate(self.tr_list, 1):
    
                        # 获取name
                        name_fund_list = self.tr_hander(tr)
                        fund_list = name_fund_list[1]
    
                        fund_dict = {
                            "name": name_fund_list[0],
                            "code": fund_list[2].strip(),
                            "nw_date": fund_list[3],
                            "net_worth": "" if fund_list[4] == "--" else fund_list[4],
                            "cum_worth": "" if fund_list[5] == "--" else fund_list[5],
                            "set_up_date": fund_list[6],
                            "due_date": fund_list[7],
                            "cast_surely_status": "" if fund_list[8] == "---" else fund_list[8],
                            "trade_status": fund_list[9],
                        }
                        print(co, fund_dict)
    
        # Narrow a table down to its row blocks
        def table_handler(self, table):
            # NOTE: the <tr> / </tr> tag text was stripped from this pattern when the
            # post was published; the reconstruction below is the likely intent.
            tr_list = re.findall('<tr[\s\S]*?</tr>', table)
            # Drop the header row
            del tr_list[0]
            return tr_list
    
        def tr_hander(self, tr):
            # The fund name sits in a title attribute; the remaining fields are the cell texts.
            name = re.search('title="(.*?)"', tr).group(1)
            # NOTE: the <td> / </td> tag text was stripped from this pattern when the
            # post was published; the reconstruction below is the likely intent.
            fund_list = re.findall('<td.*?>(.*?)</td>', tr)
            return name, fund_list
    
    
    if __name__ == '__main__':
        hua = Hua()
        hua()
    
    

    7. Qiushibaike

    import requests
    from lxml import etree
    from fake_useragent import UserAgent
    
    
    class Qiu:
    
        def __init__(self):
            self.count = 1
    
        def __call__(self, *args, **kwargs):
            self.get_max_page()
    
        def get_max_page(self):
            base_url = "https://www.qiushibaike.com/8hr/page/2/"
            html_xml = self.get_html(base_url)
            # 获取最大页码
            max_page = int(html_xml.xpath("//a/span[@class='page-numbers']/text()")[-1].strip())
            # print(max_page)
            self.get_data(max_page)
    
        def get_data(self, max_page):
    
            for page in range(1, max_page + 1):
                print("===================第{}页开始下载=========================".format(page))
                page_url = "https://www.qiushibaike.com/8hr/page/{}/".format(page)
                # print(page_url)
                html_xml = self.get_html(page_url)
                # 缩小范围
                li_list = html_xml.xpath("//li[contains(@id, 'qiushi_tag_')]")
                # print(len(li_list))
    
                for li in li_list:
                    # 获取图片
                    pic = li.xpath(".//a[contains(@class, 'recmd-left')]/img/@src")[0]
                    # if "/w/150/h/112" in pic:
                    #     pic = "https:" + pic[:-12]
                    # else:
                    #     pic = ""
    
                    # 三元表达式 实现上面的代码
                    pic = "https:" + pic[:-12] if "/w/150/h/112" in pic else ""
                    # print(pic)
    
                    # 获取昵称
                    nike_name = li.xpath(".//span[@class='recmd-name']/text()")[0]
                    # print(nike_name)
    
                    # 获取内容
                    content = li.xpath(".//a[@class='recmd-content']/text()")
                    content = content[0] if content else ""
                    # print(content)
    
                    # 获取好笑数量
                    laught_num = li.xpath(".//div[@class='recmd-num']/span[1]/text()")[0]
                    # if "万" in laught_num:
                    #     laught_num = int(float(laught_num[:-1]) * 10000)
                    # else:
                    #     laught_num = int(laught_num)
                    laught_num = int(float(laught_num[:-1]) * 10000) if "万" in laught_num else int(laught_num)
                    # print(laught_num)
    
                    # 评论数量
                    comment_num = li.xpath(".//div[@class='recmd-num']/span[4]/text()")
                    comment_num = int(comment_num[0]) if comment_num else 0
                    # print(comment_num)
    
                    qiu_dict = {
                        "pic": pic,
                        "nike_name": nike_name,
                        "content": content,
                        "laught_num": laught_num,
                        "comment_num": comment_num,
    
                    }
                    print(self.count, qiu_dict)
                    self.count += 1
    
    
        def get_html(self, base_url):
            # 随机产生一个浏览器信息
            headers = {"User-Agent": UserAgent().random}
            response = requests.get(base_url, headers=headers)
            html = response.text
            # print(html)
            html_xml = etree.HTML(html)
            return html_xml
    
    
    if __name__ == '__main__':
        qiu = Qiu()
        qiu()
    
    

    8. Scraping Baixing Data

    import requests
    from lxml import etree
    
    
    class Bai:
    
        def __init__(self):
            self.count = 1
    
        def __call__(self, *args, **kwargs):
            self.get_max_page()
    
        def get_max_page(self):
            base_url = "http://beijing.baixing.com/chongwujiaoyi/m177986/?entities=%E6%80%A7%E5%88%AB_%E5%85%AC&%E4%BB%B7%E6%A0%BC%5B0%5D=1000&%E4%BB%B7%E6%A0%BC%5B1%5D=1100&%E5%B9%B4%E9%BE%84%5B0%5D=0&%E5%B9%B4%E9%BE%84%5B1%5D=3"
            html_xml = self.get_html(base_url)
    
            # 获取最大页码
            max_page = int(html_xml.xpath("//ul[@class='list-pagination']/li[last()-1]/a/text()")[0])
            # print(max_page)
    
            # 获取数据
            self.get_data(max_page)
    
        def get_data(self, max_page):
            # 循环获取每一页的xml对象 并获取其中的指定的数据
            for page in range(1, max_page + 1):
                print("================第{}页开始下载======================".format(page))
                base_url = "http://beijing.baixing.com/chongwujiaoyi/m177986/?entities=%E6%80%A7%E5%88%AB_%E5%85%AC&page={}&%E4%BB%B7%E6%A0%BC%5B0%5D=1000&%E4%BB%B7%E6%A0%BC%5B1%5D=1100&%E5%B9%B4%E9%BE%84%5B0%5D=0&%E5%B9%B4%E9%BE%84%5B1%5D=3".format(page)
                # print(base_url)
                html_xml = self.get_html(base_url)
    
                # 缩小范围
                li_list = html_xml.xpath("//ul[@class='list-ad-items']/li[@data-aid]")
                # print(len(li_list))
    
                # 遍历获取每条狗的信息
                for co, li in enumerate(li_list, 1):
                    # 图片
                    pic = li.xpath(".//img/@src")[0]
                    if "http" not in pic:
                        pic = li.xpath(".//img/@data-originsource")
                        pic = pic[0] if pic else ""
                    # print(co, pic)
    
                    # 获取描述信息
                    desc = li.xpath(".//a[@class='ad-title']/text()")[0]
                    # print(co, desc)
    
                    # 获取地址信息
                    address = li.xpath(".//div/div[@class='ad-item-detail'][1]/text()")[0]
                    # print(address)
    
                    # 类型
                    dog_type = li.xpath(".//div/div[@class='ad-item-detail'][2]/text()")[0].strip()
                    dog_type = dog_type.replace(" ", "")
                    # print(dog_type)
    
                    # 获取价格
                    price = li.xpath(".//div/span/text()")[0]
                    # print(price)
    
                    dog_dict = {
                        "pic": pic,
                        "desc": desc,
                        "address": address,
                        "dog_type": dog_type,
                        "price": price,
                    }
                    print(self.count, dog_dict)
                    self.count += 1
    
        # 获取指定url对应的xml对象
        def get_html(self, url):
            response = requests.get(url)
            html = response.text
            # print(html)
            return etree.HTML(html)
    
    
    if __name__ == '__main__':
        bai = Bai()
        bai()
    
    

    9. Scraping Fang.com (房天下) Data

    import requests
    from lxml import etree
    import re
    
    class Fang:
    
        def __init__(self):
            self.count = 1
    
        def __call__(self, *args, **kwargs):
            self.get_max_page()
    
        def get_max_page(self):
            base_url = "https://zu.fang.com/house/i3100/"
            html, html_xml = self.get_html(base_url)
            max_page = int(re.search('共(\d+)页', html).group(1))
            # print(max_page)
    
            # 通过url后去指定页面的数据
            self.get_data(max_page)
    
        def get_data(self, max_page):
    
            for page in range(1, max_page+1):
                print("=================第{}页开始下载======================".format(page))
                page_url = "https://zu.fang.com/house/i3{}/".format(page)
    
                # 获取分页URL得页面
                html, html_xml = self.get_html(page_url)
    
                # 缩小范围
                dl_list = html_xml.xpath("//div[@class='houseList']/dl[dt]")
                # print(len(dl_list))
    
                for co, dl in enumerate(dl_list, 1):
                    # 获取图片
                    pic = "https:" + dl.xpath(".//img/@data-src")[0]
                    pic = pic.replace('275x207', "1000x1000")
                    # print(co, pic)
    
                    # 标题
                    title = dl.xpath(".//a[@title]/@title")[0]
                    # print(co, title)
    
                    # 租房类型
                    rent_type = dl.xpath(".//dd/p[2]/text()[1]")[0].strip()
                    # print(rent_type)
    
                    # 室
                    fang_info = dl.xpath(".//dd/p[2]/text()[2]")[0]
                    # print(fang_info)
                    if "室" in fang_info:
                        room = re.findall('(\d+)室', fang_info)[0]
                    else:
                        room = ""
    
                    if "厅" in fang_info:
                        ting = re.findall("(\d+)厅",fang_info)[0]
                    else:
                        ting = ""
                    # print(co, room, ting)
    
                    # 面积
                    area = dl.xpath(".//dd/p[2]/text()[3]")[0]
                    area = area[:-2]
                    # print(area)
    
                    # 朝向
                    toward = dl.xpath(".//dd/p[2]/text()[4]")[0].strip()
                    # print(toward)
    
                    # 城区
                    city_area = dl.xpath(".//dd/p[3]/a[1]/span/text()")[0]
                    # print(city_area)
    
                    # 商圈
                    business_circle = dl.xpath(".//dd/p[3]/a[2]/span/text()")[0]
                    # print(business_circle)
    
                    # 小区
                    community = dl.xpath(".//dd/p[3]/a[3]/span/text()")
                    community = community[0] if community else ""
                    # print(community)
    
                    # 地址
                    address_list = dl.xpath(".//span[@class='note subInfor']//text()")
                    # print(address)
                    # 用空字符串 将列表中的元素 连接成一个字符串
                    address = "".join(address_list)
                    # print(address)
    
                    # 标签
                    sign_list = dl.xpath(".//dd/p[@class='mt12']/span/text()")
                    # print(sign_list)
    
                    # 价格
                    price = dl.xpath(".//span[@class='price']/text()")[0]
                    # print(price)
    
                    fang_dict = {
                        "pic": pic,
                        "title": title,
                        "rent_type": rent_type,
                        "room": room,
                        "ting": ting,
                        "area": area,
                        "toward": toward,
                        "city_area": city_area,
                        "business_circle": business_circle,
                        "community": community,
                        "address": address,
                        "sign_list": sign_list,
                        "price": price,
                    }
    
                    print(self.count, fang_dict)
                    self.count += 1
    
    
                # break
    
        # 获取指定url对应的xml对象
        def get_html(self, url):
            response = requests.get(url)
            html = response.text
            # print(html)
            # with open("2.html", "r", encoding="utf-8") as f:
            #     html = f.read()
            return html, etree.HTML(html)
    
    
    if __name__ == '__main__':
        fang = Fang()
        fang()
    
    

    10. Scraping Douban Book Ranking Data

    import requests
    from fake_useragent import UserAgent
    from selenium import webdriver
    from lxml import etree
    import time
    import re
    
    
    class Dou:
    
        def __init__(self):
            self.count = 1
    
        def __call__(self, *args, **kwargs):
            self.get_data()
    
        # 获取最大页码
        def get_data(self):
    
            page = 42
            while True:
                print(f"=================第{page+1}页开始下载===================")
                base_url = "https://book.douban.com/subject_search?search_text=python&cat=1001&start={}".format(page*15)
                html, html_xml = self.get_html(base_url)
                if "查询错误" in html:
                    break
                # print(base_url)
    
                # 缩小范围 获取每本书的大div
                div_list = html_xml.xpath("//div[@class='item-root']")
                # print(div_list)
                # print(len(div_list))
    
                # 循环获取每本书的详细信息
                for co, div in enumerate(div_list, 1):
                    # 获取图片
                    pic = div.xpath(".//img/@src")[0]
                    # print(co, pic)
    
                    # 获取书名
                    name = div.xpath(".//a[@class='title-text']/text()")[0]
                    # print(co, name)
    
                    # 评分
                    score = div.xpath(".//span[@class='rating_nums']/text()")
                    score = score[0] if score else ""
                    # print(score)
    
                    # 评价人数
                    comment_nums_str = div.xpath(".//span[@class='pl']/text()")
                    comment_nums_str = comment_nums_str[0] if comment_nums_str else ""
                    comment_nums = re.findall("\d+", comment_nums_str)
                    comment_nums = int(comment_nums[0]) if comment_nums else 0
                    # print(comment_nums)
    
                    # 获取出版社信息
                    desc_info = div.xpath(".//div[@class='meta abstract']/text()")
                    if desc_info:
                        desc_info = desc_info[0]
                        desc_info = desc_info.replace(" ", "")
                    else:
                        desc_info = ""
                    # print(desc_info)
    
                    book_dict = {
                        "pic": pic,
                        "name": name,
                        "score": score,
                        "comment_nums": comment_nums,
                        "desc_info": desc_info,
                    }
    
                    print(self.count, book_dict)
                    self.count += 1
    
                page += 1
                # break
    
        # 获取指定url对应的网页信息
        def get_html(self, url):
            # 通过requests获取不到想要的页面信息 所以改用selenium获取
            # headers = {"User-Agent": UserAgent().random}
            # response = requests.get(url, headers=headers)
            # html = response.text
            # print(html)
    
            # 创建浏览器对象
            self.driver = webdriver.PhantomJS(executable_path=r"D:\phantomjs-2.1.1-windows\phantomjs"
                                                         r"-2.1.1-windows\bin\phantomjs.exe")
    
            # 使用无界面浏览器 发起请求
            self.driver.get(url)
            time.sleep(1)
            # 获取页面信息
            html = self.driver.page_source
            # print(html)
            # 将页面存入文件中 便于开发
            # with open("3.html", "r", encoding="utf-8") as f:
            #     html = f.read()
            # 返回一个xml对象
            return html, etree.HTML(html)
    
        def __del__(self):
            """
            触发条件: 当所有代码执行完成 执行此函数
            """
            # print(self.driver)
            # print(type(self.driver))
            self.driver.close()  # 关闭页面
            self.driver.quit()  # 关闭浏览器
            print("------浏览器已关闭-------")
    
    
    if __name__ == '__main__':
        dou = Dou()
        dou()
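
    Note: recent Selenium releases (4.x) removed PhantomJS support, so the PhantomJS call above only
    works with older Selenium versions. A minimal sketch of the same get_html idea using headless
    Chrome instead (it assumes a matching chromedriver is installed and on the PATH):

    from selenium import webdriver
    from lxml import etree

    def get_html_headless(url):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")     # run without a visible window
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(url)
            html = driver.page_source          # page source after the JS has run
        finally:
            driver.quit()                      # always close the browser
        return html, etree.HTML(html)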
    
    

    Taobao API

    import requests
    import json
    from fake_useragent import UserAgent
    '''
    Analysis:
    First tier: https://tce.taobao.com/api/mget.htm?callback=jsonp1606&tce_sid=1870316,1871653&tce_vid=2,2&tid=,&tab=,&topic=,&count=,&env=online,online
    1870316  tier 1
    1870321  tier 2
    1870333  tier 3
    1870340  tier 4
    1870341  tier 5
    1870342  tier 6
    1870343  tier 7
    '''
    class Tao:
        def __init__(self):
            pass
        def __call__(self, *args, **kwargs):
            self.get_data()
        def get_data(self):
            base_url = "https://tce.taobao.com/api/mget.htm?tce_sid=1870316,1870321,1870333,1870340,1870341,1870342,1870343&tce_vid=2,2,2,2,2,2,2"
            headers = {"User-Agent": UserAgent().random}
    
            # 对接口发起请求
            response = requests.get(base_url, headers=headers)
            # 获取字符串数据
            str_data = response.text.strip()
            # print(str_data)
            # 获取json数据
            json_data = json.loads(str_data)
            # print(json_data)
            count = 1
            # 获取到的是一个字典
            data_dict = json_data.get('result')
            for i in data_dict.values():
                data_list = i.get("result")
                for data in data_list:
                    data["item_pic"] = "https:" + data["item_pic"]
                    print(count, data)
                    count += 1
    
    
    if __name__ == '__main__':
        tao = Tao()
        tao()
    
    

    11. Scraping NetEase Cloud Music Data

    import requests
    from lxml import etree
    from fake_useragent import UserAgent
    
    class Music:
    
        def __init__(self):
            self.count = 1
    
        def __call__(self, *args, **kwargs):
            self.get_class_url_list()
    
        # 获取分类url列表
        def get_class_url_list(self):
            # 发起请求 获取指定页面
            base_url = "https://music.163.com/discover/artist"
            html_xml = self.get_html(base_url, 1)
    
            # 获取分类url
            class_url_list = html_xml.xpath("//a[@class='cat-flag']/@href")
            class_name_list = html_xml.xpath("//a[@class='cat-flag']/text()")
            del class_name_list[0]
            del class_url_list[0]
            # print(class_url_list)
            # print(class_name_list)
            # print(len(class_url_list))
            # print(len(class_name_list))
            for index in range(len(class_url_list)):
                # index += 1
                print("==============={}开始下载================".format(class_name_list[index]))
                # 拼接完整的分类url
                class_url = "https://music.163.com" + class_url_list[index]
                # print(class_url)
    
                # 通过分类url获取字母的url
                self.get_alphabet_url(class_url)
    
                # break
    
        def get_alphabet_url(self, class_url):
            # 获取分类url的页面 xml对象
            html_xml = self.get_html(class_url, 1)
    
            # 获取字母url列表
            alphabet_url_list = html_xml.xpath("//ul[@class='n-ltlst f-cb']/li[position()>1]/a/@href")
            # print(alphabet_url_list)
    
            # 循环获取每个字母url对应歌手信息
            for alphabet_url in alphabet_url_list:
                # 拼接完整的字母url
                alphabet_url = "https://music.163.com" + alphabet_url
    
                self.get_singer_info(alphabet_url)
                # break
    
        def get_singer_info(self, alphabet_url):
    
            # 根据字母url获取每个歌手的名称和对应的详情url
            html_xml = self.get_html(alphabet_url, 1)
    
            singer_name_list = html_xml.xpath("//a[@class='nm nm-icn f-thide s-fc0']/text()")
            singer_url_list = html_xml.xpath("//a[@class='nm nm-icn f-thide s-fc0']/@href")
            # print(singer_name_list)
            # print(singer_url_list)
            # print(len(singer_name_list))
            # print(len(singer_url_list))
            for index in range(len(singer_name_list)):
                # 声明一个存放歌手信息的字典
                singer_url = "https://music.163.com" + singer_url_list[index].strip()
    
    
    
                # import json
                # singer_dict = json.dumps(singer_dict)
                # with open("singer.txt", "w", encoding="utf-8") as f:
                #     f.write(singer_dict + "\n")
    
                html_xml = self.get_html(singer_url, 0)
                # tbody appears in the browser's DOM, but usually not in the HTML the code receives
                hot_song = html_xml.xpath("//ul[@class='f-hide']/li/a/text()")
                # print(hot_song)
                singer_dict = {
                    "singer_name": singer_name_list[index],
                    "singer_url": singer_url,
                    "hot_song": hot_song
                }
                print(self.count, singer_dict)
                self.count += 1
                # break
    
        # 获取指定url对应的页面信息
        def get_html(self, url, sign):
            '''
            :param url: the url to fetch
            :param sign: selects the headers: if 1, use the headers above, otherwise
                         use the headers below
            :return:
            '''
            # headers = {"User-Agent": UserAgent().random}
            # if sign == 0:
            headers = {
                "cookie": "[email protected]:-1:1; mail_psc_fingerprint=7fb6c5032f50ce8c1a07fdb15fd2251d; _iuqxldmzr_=32; _ntes_nnid=ec024cec32803d4dfd5c42e4e40cba08,1552969997617; _ntes_nuid=ec024cec32803d4dfd5c42e4e40cba08; WM_TID=eZJB4FRfmstFBVFRVFZ508IkS9OSa6K6; usertrack=CrHtiVyQhXO2rmpiAwOpAg==; UM_distinctid=16a307022e2b3-0b705b12e3ccd3-414f0c2a-100200-16a307022e3361; NTES_CMT_USER_INFO=72051947%7Cm13349949963_1%40163.com%7Chttp%3A%2F%2Fcms-bucket.nosdn.127.net%2F2018%2F08%2F13%2F078ea9f65d954410b62a52ac773875a1.jpeg%7Cfalse%7CbTEzMzQ5OTQ5OTYzXzFAMTYzLmNvbQ%3D%3D; vinfo_n_f_l_n3=dd7e8b71253298e9.1.0.1555590818606.0.1555590912731; [email protected]|1558093033|0|mail163|00&99|gud&1557298197&urs#bej&null#10#0#0|133963&1||[email protected]; WM_NI=ROVoQSBgJquFTl4wFtlT0uStCW6f1tfWf3lX6czDHARSzgJQQaXu0QDk3vv%2BGl8GXFZhvOKF0OdWlzFB5MvSmfqUF%2B2c8YDTYjUbcM1JWQMmcQImmDpluWXxtf50voINRkI%3D; WM_NIKE=9ca17ae2e6ffcda170e2e6eeb4ae3fbbed98abef7d9a9a8bb2d85a939f9aaff763ac9a8c96ae79b5989da6f52af0fea7c3b92a92919a90d45982b98692f84e98b4fc98c580b08c0096d2808189fa87b480a689aad4ef54f6bdb6a5cb4b928db688c95b93bf9896b35b88b5fd97f52185b4f8a8db4e9ab8bab0ca4ef491acb8ef72869efbaef559afbabfb6c521f2bdf8bac7609bb69b83e247f39699b2d067a18f878ef050b4b4bbb8db74b8bafbd1f5658b929e8ccc37e2a3; __remember_me=true; gdxidpyhxdE=YoWfxdQEE%2BgYxhtnKi5zVBa4eaecS1%2F%2BR48h%2FgaKUjHCIj9OPH8QnoJuU4VE%2BYq4zYxRiKjDWw%2BR%2Bey3b9tDY4PDQSfKUjPQkuqfkPZY6oDRPPZouWGNpQMKNdSy8lpSY7W7Syf90lWTaOUXDzSavZz%5Cw4A1LcvEXNtkeBjksCD5L%2F7O%3A1559116416164; _9755xjdesxxd_=32; NETEASE_WDA_UID=1866933109#|#1559115550866; MUSIC_U=065d91e631703dfb7280fe33a565a5643bafb378927678189c0459a4967381afd261a8a054abc7f1c2a0cd2f9ccbfca9b9370d24fa62f9d6c26e43e3ad55584d850eee1fae4e41b77955a739ab43dce1; __csrf=b8c227a578ab1044087e44fe79d5b402; JSESSIONID-WYYY=blMRzR0VnxMzQI3YWDAisc30pDmUBmsJPcTiRP5bRK0eGtlnRzQnG4Ee963zZ9jzGlA1pX1VyCx8kOkqhCRWwDpAw84JQ4RetEJunCyMYUjgW5d5l4gPYKBTMPkBPiDD8pM9JGynKZei2c338XnVcZBC939OsBPXQR5UlDjc5pZf%2FCew%3A1559119405744"
            }
            response = requests.get(url, headers=headers)
            html = response.text
            # 只打印 歌手信息的页面
            if sign == 0:
                # print(html)
                pass
            return etree.HTML(html)
    
    
    if __name__ == '__main__':
        music = Music()
        music()
    
    
    '''
    index {"singer": "ljj", "hot_song": ["", ""]}
    {"ljj": ["", ""]}
    '''
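
    The commented-out singer.txt code above hints at persisting each singer_dict. A small sketch of
    doing that as JSON lines (appending, so earlier singers are not overwritten; the file name is
    just an example):

    import json

    def save_singer(singer_dict, path="singer.jsonl"):
        # One JSON object per line; "a" keeps the earlier entries.
        with open(path, "a", encoding="utf-8") as f:
            f.write(json.dumps(singer_dict, ensure_ascii=False) + "\n")

    # save_singer({"singer_name": "...", "singer_url": "...", "hot_song": ["..."]})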
    

    12. Scraping Lianjia Rental Listings

    import redis
    import requests
    from lxml import etree
    from fake_useragent import UserAgent
    import re
    import pymysql
    
    
    class CityArea:
    
        def __init__(self):
            # 初始化redis连接
            self.r = self.get_redis()
    
        def __call__(self, *args, **kwargs):
            self.get_city_area()
    
        # redis数据库连接
        def get_redis(self):
            return redis.Redis(host="127.0.0.1", port=6379, db=1)
    
        def get_city_area(self):
            # 获取城区信息
            base_url = "https://bj.lianjia.com/zufang/"
            html_xml = self.get_html(base_url)
    
            city_area_list = html_xml.xpath("//ul[@data-target='area']/li[position()>1]/a/@href | "
                           "//ul[@data-target='area']/li[position()>1]/a/text()")
            print(city_area_list)
            print(len(city_area_list))
    
            for city_area in city_area_list:
                if "zufang" in city_area:
                    city_area = "https://bj.lianjia.com" + city_area
                print(city_area)
                # 将城区信息插入数据库
                self.r.rpush("city_area_list", city_area)
    
        # 获取指定url对应xml页面
        def get_html(self, url):
            headers = {"User-Agent": UserAgent().random}
            response = requests.get(url, headers=headers)
            html = response.text
            # print(html)
            return etree.HTML(html)
    
    
    class BusinessCircle(CityArea):
    
        def __call__(self, *args, **kwargs):
            self.get_business_circle()
    
        # 通过城区url获取商圈url
        def get_business_circle(self):
            count = 1
            # 查询城区信息
            city_area_list = self.r.lrange("city_area_list", 0, -1)
            # print(city_area_list)
            for index in range(0, len(city_area_list), 2):
                # print(index)
                # 分别获取城区url和城区的名称
                city_area_url = city_area_list[index].decode("utf-8")
                city_area_name = city_area_list[index+1].decode("utf-8")
                print(city_area_url, city_area_name)
    
                # 获取城区url xml对象
                html_xml = self.get_html(city_area_url)
                # 获取商圈信息
                business_circle_list = html_xml.xpath("//div[@id='filter']/ul[4]/li[position()>1]/a/@href | "
                                                      "//div[@id='filter']/ul[4]/li[position()>1]/a/text()")
    
                print(business_circle_list)
                for index in range(len(business_circle_list)):
                    # 获取商圈列表中的信息
                    business_circle = business_circle_list[index]
                    # 将城区和商圈用-连接起来 存入数据库
                    if index % 2 == 1:
                        business_circle = city_area_name + "-" + business_circle_list[index]
                    print(count, business_circle, type(business_circle))
                    # print(type(business_circle))
                    count += 1
    
                    # 存入数据库
                    self.r.rpush("business_circle_list", business_circle)
    
                # break
    
    
    class Lian(CityArea):
    
        def __call__(self, *args, **kwargs):
            self.conn_mysql()
            self.count_ucid = 1
            self.get_page_url()
    
        def get_page_url(self):
            # 查询数据库中的商圈信息
            business_circle_list = self.r.lrange("business_circle_list", 0, -1)
            # print(business_circle_list)
            # 循环获取商圈url
            for index in range(0, len(business_circle_list), 2):
                # 分别获取商圈url和商圈名称
                business_circle_url = business_circle_list[index].decode("utf-8")
                # 拼接完整的商圈url
                business_circle_url = "https://bj.lianjia.com" + business_circle_url
                business_circle_name = business_circle_list[index+1].decode("utf-8")
                print(f"==================={business_circle_name}开始下载====================")
                print(business_circle_url, business_circle_name)
                # 获取商圈url指定xml页面
                html_xml = self.get_html(business_circle_url)
    
                # 获取最大页码
                max_page = html_xml.xpath("//div[@class='content__pg']/@data-totalpage")
                # 如果获取不到最大页码 则max_page 为空列表 然后跳过本次循环
                if not max_page:
                    continue
                max_page = int(max_page[0])
                # print(max_page, type(max_page))
    
                # 循环生成分页url
                for page in range(1, max_page+1):
                    # 拼接完整的分页url
                    page_url = business_circle_url + "pg{}/".format(page)
                    # print(page_url)
                    # 获取数据
                    self.get_data(page_url)
                    break
                break
    
        # 获取指定分页url的数据
        def get_data(self, page_url):
            # 获取分页url页面
            html_xml = self.get_html(page_url)
    
            # 缩小范围
            div_list = html_xml.xpath("//div[@class='content__list']/div")
    
            for div in div_list:
                # 图片
                pic = div.xpath(".//img/@data-src")[0]
                pic = pic.replace("250x182", "2500x1800")
                # print(pic)
    
                # 标题
                title = div.xpath(".//p[@class='content__list--item--title twoline']/a/text()")[0].strip()
                # print(title)
    
                # 城区
                city_area = div.xpath(".//p[@class='content__list--item--des']/a[1]/text()")[0]
    
                # 商圈
                business_circle = div.xpath(".//p[@class='content__list--item--des']/a[2]/text()")[0]
                # print(city_area, business_circle)
    
                # 面积
                area = div.xpath(".//p[@class='content__list--item--des']//text()[4]")
                area = area[0].strip() if area else ""  # 空值处理
                # print(area)
    
                # 朝向
                toward = div.xpath(".//p[@class='content__list--item--des']//text()[5]")[0].strip()
                # print(toward)
    
                # 房间信息
                fang_info = div.xpath(".//p[@class='content__list--item--des']//text()[6]")[0].strip()
                # print(fang_info)
                room = re.findall("(\d+)室", fang_info)  # 室
                hall = re.findall("(\d+)厅",fang_info)  # 厅
                toilet = re.findall("(\d+)卫", fang_info)  # 卫
                # 空值处理
                room = int(room[0]) if room else 0
                hall = int(hall[0]) if hall else 0
                toilet = int(toilet[0]) if toilet else 0
                # print(room, hall, toilet)
    
                # 发布时间
                publish_date = div.xpath(".//p[@class='content__list--item--time oneline']/text()")[0]
                # print(publish_date)
    
                # 标签
                sign_list = div.xpath(".//p[@class='content__list--item--bottom oneline']/i/text()")
                # print(sign_list)
                # 将标签转换为字符串
                sign = "#".join(sign_list)
                # print(sign)
    
                # 价格
                price = div.xpath(".//em/text()")[0]
                # print(price)
    
                # 详情url
                detail_url = div.xpath(".//p[@class='content__list--item--title twoline']/a/@href")[0]
                # 拼接完整的详情url
                detail_url = "https://bj.lianjia.com" + detail_url
                # print(detail_url)
    
                fang_dict = {
                    "pic": pic, "title": title, "city_area": city_area, "business_circle": business_circle,
                    "area": area, "toward": toward, "room": room, "hall": hall, "toilet": toilet,
                    "publish_date": publish_date, "sign": sign, "price": price, "detail_url": detail_url
                }
    
                self.parse_detail(fang_dict)
    
        # 解析详情页
        def parse_detail(self, fang_dict):
    
            # print(fang_dict)
            detail_url = fang_dict['detail_url']
            print(detail_url)
    
            # 获取详情url对应的xml对象
            html_xml = self.get_html(detail_url)
    
            floor = html_xml.xpath("//ul/li[@class='fl oneline'][8]/text()")
            floor = floor[0] if floor else ""
            # print(floor)
    
            # 获取经纪人电话号码 不在页面中
            # 电话号码在接口中
            # phone = html_xml.xpath(".//p[@class='content__aside__list--bottom oneline phone']/text()")
            # print(phone)
    
            # 获取经纪人id号 ucid
            ucid = self.get_ucid(html_xml)
            # print(ucid)
            # 获取house_code
            house_code = re.findall("zufang/(.*?).html", detail_url)[0]
            # print(house_code)
    
            # 拼接完整的经纪人接口
            agent_url = f"https://bj.lianjia.com/zufang/aj/house/brokers?" \
                        f"house_codes={house_code}&position=bottom" \
                        f"&ucid={ucid}"
            # print(agent_url)
            try:
                # 获取接口中的信息
                headers = {"User-Agent": UserAgent().random}
                json_data = requests.get(agent_url, headers=headers).json()
                # print(json_data)
                phone = json_data.get("data")[house_code][house_code].get("tp_number")
    
                # print(phone)
            except Exception as e:
                print(e)
                phone = ''
    
            # 将电话和楼层信息放到fang_dict中
            fang_dict["floor"] = floor
            fang_dict["phone"] = phone
    
            self.insert_mysql(fang_dict)
    
        def insert_mysql(self, fang_dict):
            # Placeholder: the actual INSERT was never written in the post; for now just
            # confirm the connection and cursor exist (see the sketch after this section
            # for one possible implementation).
            print(self.conn)
            print(self.cur)
    
        def conn_mysql(self):
            # 创建数据库的连接对象
            self.conn = pymysql.connect(host="127.0.0.1", user="root",
                                        database="0218", charset="utf8")
            # 创建操作数据库的对象
            self.cur = self.conn.cursor()
    
        def get_ucid(self, html_xml):
    
            try:
                ucid = html_xml.xpath("//span[@class='contact__im im__online']/@data-info")[0]
                # print(ucid)
                self.count_ucid = 1
                return ucid
            except Exception as e:
                print(e)
                if self.count_ucid == 3:
                    return ""
                else:
                    self.count_ucid += 1
                    return self.get_ucid(html_xml)
    
    
    # ucid = self.get_ucid() = self.get_ucid(html_xml) = ucid
    
    
    if __name__ == '__main__':
        # cityarea = CityArea()
        # cityarea()
        # 实例化BusinessCircle bc为当前类的对象 调用时触发__call__
        # bc = BusinessCircle()
        # bc()
        lian = Lian()
        lian()
    
    
    
    
    '''
    Broker phone API analysis -- the number is returned by this endpoint, not the page itself:
    https://bj.lianjia.com/zufang/aj/house/brokers?house_codes=BJ2259333770690183168&position=bottom&ucid=1000000026012783
    https://bj.lianjia.com/zufang/aj/house/brokers?house_codes=BJ2234691835526389760&position=bottom&ucid=1000000023002201
    '''
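
    insert_mysql above only prints the connection objects. One possible implementation, assuming a
    hypothetical lianjia table whose columns mirror the keys of fang_dict (the table name and columns
    are illustrative, not from the original post):

    def insert_mysql(self, fang_dict):
        # Hypothetical table and columns; adjust to your own schema.
        sql = ("INSERT INTO lianjia (title, city_area, business_circle, area, toward, room, hall, "
               "toilet, publish_date, sign, price, floor, phone, detail_url, pic) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        values = (fang_dict["title"], fang_dict["city_area"], fang_dict["business_circle"],
                  fang_dict["area"], fang_dict["toward"], fang_dict["room"], fang_dict["hall"],
                  fang_dict["toilet"], fang_dict["publish_date"], fang_dict["sign"],
                  fang_dict["price"], fang_dict["floor"], fang_dict["phone"],
                  fang_dict["detail_url"], fang_dict["pic"])
        try:
            self.cur.execute(sql, values)
            self.conn.commit()        # persist the row
        except Exception as e:
            self.conn.rollback()      # undo on failure
            print(e)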
    
    
