网络爬虫-抓取全国高校名单(分区域)

参考链接: 从人人网获取全国中学信息(省市县)

主要代码是在参考上面博客的基础上改进而来的原创实现,从人人网的学校选择弹框表单中抓取出全国高校名单.

网络爬虫-抓取全国高校名单(分区域)_第1张图片
网络爬虫-抓取全国高校名单(分区域)_第2张图片

主要代码块如下

def getProvinceData():
    """Parse ./cityArray.js and build the province -> cities structure.

    Returns a list of dicts: {"id": <4-digit str>, "name": <str>,
    "citys": [{"id", "name"}, ...]}.  Only provinces whose numeric id
    appears in the module-level ``provinceMap`` are kept.
    """
    # Match "<digits>:<escaped-name>" pairs, e.g. "1101:\u5317\u4eac".
    # Raw string keeps the regex identical to the original pattern.
    pattern = re.compile(r"(\d+):([\w\d\\]+)")
    provinceList = []
    # Context manager closes the file handle (the original leaked it).
    with open("./cityArray.js", encoding="utf-8") as content:
        for line in content:
            data = pattern.findall(line)
            if not data:
                # Lines without any id:name pair — the original would have
                # raised IndexError on data[0][0] below.
                continue
            # 4-digit ids are cities; longer ids are finer-grained entries.
            citys = [
                {"id": item_id, "name": unescape(item_name)}
                for item_id, item_name in data
                if len(item_id) == 4
            ]
            # The first id on the line determines the province: either it is
            # already 4 digits, or we take its first 4 digits.
            first_id = data[0][0]
            province_id = first_id if len(first_id) == 4 else first_id[0:4]

            # Only keep the provinces listed in provinceMap (defined
            # elsewhere in this file).
            if int(province_id) in provinceMap:
                provinceList.append({
                    "id": province_id,
                    "name": provinceMap[int(province_id)],
                    "citys": citys,
                })
    return provinceList

def getTownHtml(town_id, scoolType):
    """Fetch the renren school-list page for one town.

    scoolType (typo kept — callers may pass it by keyword) is the URL path
    segment selecting the school category; town_id selects the town.
    Returns the response body as text, or None on a network failure.
    """
    url = "http://support.renren.com/{0}/{1}.html".format(scoolType, town_id)
    try:
        # `headers` is a module-level dict defined elsewhere in this file.
        html = requests.get(url, headers=headers).text
        print(html)
        return html
    except requests.RequestException:
        # Narrowed from a bare `except:`, which also hid real bugs such as
        # NameError.  Network errors remain best-effort: report and return
        # None, as the original did implicitly.
        print("网络错误!")
        return None


def getCitySchool(content):
    """Parse one city page into districts with their school lists.

    content: HTML text of a city page (as returned by getTownHtml).
    Returns a list of district dicts [{"name", "id", "schoolList"}], or
    None when the page contains no <ul id="schoolCityQuList"> index.
    """
    selector = etree.HTML(content)
    # Every <ul> on the page: one is the district index, the others hold
    # one district's school links each.
    townlist = selector.xpath('//ul')
    d = {}
    for town1 in townlist:
        ul_ids = town1.xpath('./@id')
        if not ul_ids:
            # <ul> without an id attribute: nothing to key on — skip.
            # (The original indexed [0] unconditionally and would crash.)
            continue
        name1 = ul_ids[0].strip()
        if name1 == "schoolCityQuList":
            # District index: collect each district's display name and id.
            d["city"] = []
            for y in town1.xpath('.//a'):
                y1 = etree.tostring(y, encoding='utf-8', pretty_print=True,
                                    method="html").decode(encoding="utf-8")
                d["city"].append({
                    # '>(.*?)<' captures the anchor text.  The original
                    # pattern '>(.*?)' lazily matched zero characters and
                    # always produced ''.
                    "name": re.findall('>(.*?)<', y1)[0],
                    "id": re.findall("'city_qu_(.*?)'", y1)[0]
                })
            continue

        # Any other <ul> (id "city_qu_<id>") lists one district's schools.
        citySchoolData = []
        for town in town1.xpath('.//a'):
            town = etree.tostring(town, encoding='utf-8', pretty_print=True,
                                  method="html").decode(encoding="utf-8")
            print(town)
            p = {}
            try:
                p['name'] = re.findall('>(.*?)<', town)[0]
            except IndexError:
                # Fallback: some entries end the name with a newline
                # instead of a closing tag on the same line.
                p['name'] = re.findall('>(.*?)\n', town)[0]
            p['id'] = re.findall('href="(.*?)"', town)[0]
            citySchoolData.append(p)

        # Attach the school list to its district record.  `or []` guards
        # pages where the district index was never seen (the original
        # raised TypeError iterating None in that case).
        for x in d.get('city') or []:
            if name1.replace('city_qu_', '') == x.get('id'):
                x['schoolList'] = citySchoolData
    return d.get('city')

def getUnicodeStr(s):
    """Decode a run of HTML numeric character references.

    Example: "&#20013;&#22269;" -> "中国".  Each ";"-separated chunk is
    expected to look like "&#NNNN": the two leading characters ("&#") are
    dropped and the remainder parsed as a decimal code point.  Chunks that
    don't parse (e.g. the empty tail after a trailing ";") are skipped,
    preserving the original best-effort behavior.
    """
    name = []
    for word in s.split(";"):
        try:
            name.append(chr(int(word[2:])))
        except (ValueError, OverflowError):
            # int("")/non-numeric garbage raises ValueError; chr() on an
            # out-of-range code point raises ValueError or OverflowError.
            # Narrowed from a bare `except:` that hid real bugs.
            continue
    return "".join(name)

结果展示(最终格式为json):

网络爬虫-抓取全国高校名单(分区域)_第3张图片
网络爬虫-抓取全国高校名单(分区域)_第4张图片

Ending

Github传送门

持续更新ing (欢迎各种star与fork)

联系方式: 442891187(QQ)

如有权益问题可以发私信联系我删除

你可能感兴趣的:(MySpider)