58同城 (58.com) crawler

crawl_tool_for_py3_v4 and MysqlPipeline are utility classes I wrote myself, so they are not attached here.
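For readers who don't have those files, here is a rough stand-in with the semantics the script below assumes (requests + lxml + pymysql; the method names come from how they are used, while everything else, including the proxy source and the table layout, is my own guess):

import re
import requests
import pymysql
from lxml import etree

class crawlerTool:
    @staticmethod
    def get(url, proxies=None):
        # fetch a page and return the raw bytes
        return requests.get(url, proxies=proxies, timeout=15).content

    @staticmethod
    def getXpath(xpath, buf):
        # accept raw html (bytes/str) or an already-parsed lxml element;
        # for elements, anchor '//' at the element itself, not the document root
        if isinstance(buf, etree._Element):
            return buf.xpath('.' + xpath if xpath.startswith('//') else xpath)
        return etree.HTML(buf).xpath(xpath)

    @staticmethod
    def getXpath1(xpath, buf):
        res = crawlerTool.getXpath(xpath, buf)
        return res[0] if res else ''

    @staticmethod
    def getRegex(pattern, buf):
        if isinstance(buf, bytes):
            buf = buf.decode('utf8', 'ignore')
        m = re.search(pattern, buf)
        return m.group(1) if m else ''

    @staticmethod
    def get_new_1min_proxy():
        # placeholder: return an 'ip:port' string from whatever
        # short-lived proxy service you use
        raise NotImplementedError

class MysqlPipeline:
    def __init__(self):
        self.conn = pymysql.connect(host='localhost', user='root', password='***',
                                    db='db58', charset='utf8mb4')

    def insert_into_table(self, page_id, data_obj):
        # store one listing as a JSON blob keyed by its page id (table layout guessed)
        with self.conn.cursor() as cur:
            cur.execute('REPLACE INTO contact (page_id, data) VALUES (%s, %s)',
                        (page_id, data_obj))
        self.conn.commit()

With these drop-ins the script below should run end to end, once get_new_1min_proxy is wired to a real proxy service and the MySQL table exists.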

The goal is to crawl rental listings from 58同城. The obstacles I ran into were font-based obfuscation of digits and IP blocking. 58 only exposes real contact numbers for free (individual) listings: anything carrying the 安居 (Anjuke) label is replaced with a 58 virtual number, and numbers on listings older than three months are hidden as well. To avoid triggering CAPTCHAs, every link needs its own IP.
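To illustrate the font trick with a minimal sketch (the codepoints and glyph names below are made up; real pages ship a different font on every load): digits are rendered as private-use Unicode characters, and the embedded TTF's cmap maps each codepoint to a glyph name like 'glyph00003', whose last two digits minus one give the real digit.

cmap = {0x9476: 'glyph00003', 0x958F: 'glyph00008'}  # shaped like fontTools' getBestCmap() output

def decode_char(ch):
    name = cmap.get(ord(ch))
    return str(int(name[-2:]) - 1) if name else ch

print(''.join(decode_char(c) for c in '\u9476\u958f'))  # -> '27'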

The crawling cost feels rather high. I originally assumed other people had cracked the app to get the numbers, but now I suspect they simply skip the encrypted ones. Reverse engineering is still too hard.

# -*- coding: utf-8 -*-
# @Time    : 2019/11/19 15:03
# @Author  : meng_zhihao
# @Email   : 312141830@qq.com
# @File    : 58phone.py

from crawl_tool_for_py3_v4 import crawlerTool as ct
from pipeline import MysqlPipeline

import time
import json
import re
import base64
from io import BytesIO
from fontTools.ttLib import TTFont

def get_page_show_ret(string, bs64_str):
    # decode obfuscated digits using the TTF font shipped base64-encoded in the page
    font = TTFont(BytesIO(base64.decodebytes(bs64_str.encode())))
    c = font.getBestCmap()  # {codepoint: glyph name, e.g. 'glyph00003'}
    ret_list = []
    for char in string:
        decode_num = ord(char)
        if decode_num in c:
            # the glyph name ends in a two-digit index that is the real digit plus one
            num = int(c[decode_num][-2:]) - 1
            ret_list.append(num)
        else:
            ret_list.append(char)  # ordinary characters pass through unchanged
    return ''.join(str(item) for item in ret_list)
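
# A quick way to eyeball the mapping while debugging (my own sketch, not part
# of the original script): dump the cmap of one downloaded font.
def dump_cmap(bs64_str):
    font = TTFont(BytesIO(base64.decodebytes(bs64_str.encode())))
    for codepoint, glyph_name in font.getBestCmap().items():
        # e.g. 0x9476 -> 'glyph00003' means that character renders as the digit 2
        print(hex(codepoint), '->', glyph_name)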


def convert_info(info, bs64_str):
    # join all text nodes of an element, then decode the obfuscated digits
    info = ' '.join(ct.getXpath('//text()', info))
    info = get_page_show_ret(info, bs64_str)
    return info.replace('\n', '').replace(' ', '')
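
# Minimal self-check of convert_info (hypothetical input; assumes bs64_str was
# already extracted from a real page and that ct.getXpath accepts an lxml element):
#   from lxml import etree
#   li = etree.HTML('<li>整租 \u9476\u958f室</li>').xpath('//li')[0]
#   print(convert_info(li, bs64_str))  # -> e.g. '整租27室'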


def main():
    # Yiwu rentals, entry page: https://yiwu.58.com/chuzu/?PGTID=0d3090a7-0300-3c7f-dd90-4cb3210e1f51&ClickID=8
    # Agents always seem to get virtual numbers, and most listings are paid ones anyway...
    db_contact = MysqlPipeline()  # one DB connection for the whole run
    for i in range(1, 70):
        page_url = 'https://yiwu.58.com/chuzu/pn%s/?PGTID=0d3090a7-0300-35bc-792c-0181bfe9236f&ClickID=2' % i
        proxy = ct.get_new_1min_proxy()
        proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
        print(proxies)
        time.sleep(1)
        page_buf = ct.get(page_url, proxies=proxies)

        sub_urls = ct.getXpath('//h2/a/@href', page_buf)  # detail-page links on this list page
        for url in sub_urls:
            try:
                print(url)
                proxy = ct.get_new_1min_proxy()  # a fresh IP per detail page keeps the CAPTCHA away
                proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
                page_id = ct.getRegex(r'/(\d+x)\.shtml', url)
                page_buf = ct.get(url, proxies=proxies)
                privacyCallUrl = ct.getRegex("privacyCallUrl = '(.*?)'", page_buf)
                print(privacyCallUrl)
                if privacyCallUrl:
                    time.sleep(4)
                    json_data = ct.get(privacyCallUrl, proxies=proxies)
                    json_data = json.loads(json_data.decode('utf8'))
                    print(json_data)
                    phone_num = json_data.get('data')  # the (possibly virtual) number sits under 'data'
                else:
                    phone_num = ''  # no call endpoint: the number is hidden
                title = ct.getXpath1('//h1/text()', page_buf)
                # the obfuscation font is embedded in the page css as
                # url('data:...;charset=utf-8;base64,<payload>')
                bs64_str = ct.getRegex(r"charset=utf-8;base64,(.*?)'\)", page_buf)

                money = ct.getXpath1('//div[@class="house-pay-way f16"]', page_buf)  # not quite right (author's note)
                money = convert_info(money, bs64_str)
                print(money)
                lis = ct.getXpath('//ul[@class="f14"]/li', page_buf)  # the listing's attribute rows
                zlfs = convert_info(lis[0], bs64_str)  # 租赁方式: rental mode (whole / shared)
                fwlx = convert_info(lis[1], bs64_str)  # 房屋类型: housing type
                cxlc = convert_info(lis[2], bs64_str)  # 朝向楼层: orientation / floor
                szxq = convert_info(lis[3], bs64_str)  # 所在小区: residential complex
                ssqy = convert_info(lis[4], bs64_str)  # 所属区域: district
                xxdz = convert_info(lis[5], bs64_str)  # 详细地址: full address

                data_obj = json.dumps({
                    'money': money,
                    'title': title,
                    'zlfs': zlfs,
                    'fwlx': fwlx,
                    'cxlc': cxlc,
                    'szxq': szxq,
                    'ssqy': ssqy,
                    'xxdz': xxdz,
                    'phone_num': phone_num,
                })
                db_contact.insert_into_table(page_id, data_obj)  # one JSON row per listing id

            except Exception as e:
                print(e)


if __name__ == '__main__':
    main()

