项目的请求url类型https://k.autohome.com.cn/detail/view_01cezq86y568r3ad1m6ws00000.html?st=4&piap=0|3170|0|0|1|0|0|0|0|0|1#pvareaid=2112108
以前有写过汽车之家的爬虫,但是有一段时间没有爬了,所以网站也更新了。
现在(2018年8月23日)的情况是这样:请求 URL 后,返回的数据是一段很长的 JS,加上自定义字符的请求链接。
网页先加载 JS,由 JS 操作页面元素,将运行的结果进行替换,得到16进制的编码,然后再由字体文件进行替换。
1. 第一个困难点就是得到替换的KC_的值。这里我是将源代码中的 JS 和需要变更的部分取出,简单修改替换的值,然后加上 jQuery 文件生成一个 HTML,最后使用 Chrome 渲染(关掉图片加载,秒渲染~),然后得到我需要替换的文字。这里的文字和字体文件中的名字是对应的。
2. 以前字体文件虽然名称和顺序会改变,但是字体结构是不变的;现在字体结构会随机偏移5个位移。这里我是找了资料,
用 fontTools 库的函数将字体生成图片输出,然后使用 TensorFlow 将2套图作为样本训练了一下,再对得到的图片进行识别,正确率是100%。
3. 然后用图片分类得到的字符串将代码中的结果替换掉,得到和网页完全一样的结果。
这里就简单放一个主逻辑的代码
import re
import time

import requests
from fontTools.ttLib import TTFont
from lxml import etree

from cnnn import te
from mysql_conn_info import MysqlHelper
from req_js_str import WebDri
from save_png import mainn

my_db = MysqlHelper()
webdr = WebDri()

# Classifier label (0-89) -> character, laid out ten labels per row.
_LABEL_CHARS = (
    "呢了右电近音上不小性"
    "味自二机软泥油空只好"
    "手下启地量少档路灯当"
    "六得养孩实硬很开坏冷"
    "一来保八多高三过皮级"
    "响无中门耗雨远身坐更"
    "四内矮五左加里问短着"
    "七副低和长光动是外控"
    "十比真盘排公有的九大"
)
_LABEL_TO_CHAR = dict(enumerate(_LABEL_CHARS))


def run(info_url):
    """Fetch one review page, restore its obfuscated text and store it.

    The visible steps are: download the page, read the publish time and
    title via XPath, obtain the JS-rendered text from ``webdr.get_()``,
    download the TTF font referenced by the page, classify every glyph
    the text refers to (``mainn`` + ``te``), map each class label to a
    character with ``iff``, substitute the characters into the text and
    persist the result through ``my_db.update``.

    Args:
        info_url: URL of the review detail page to process.
    """
    url = info_url
    headers = {
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    }
    response = requests.get(url=url, headers=headers, timeout=15)
    data = response.text
    # Keep a copy of the raw page on disk.
    with open("搜索结果aa.html", "w", encoding="utf-8") as f:
        f.write(data)

    html = etree.HTML(data)
    # Publish time of the review.
    publish_time = html.xpath("""//div[@class="title-name name-width-01"]/b/text()""")[0].strip()
    # Title of the review.
    title = html.xpath("""//div[@class="kou-tit"]/h3/text()""")[0]

    # NOTE(review): the next three statements are reproduced verbatim from
    # the published listing, which looks truncated at this point: the
    # regex pattern is empty, and ``text_con``, ``data2`` and ``g`` are
    # used without any visible definition. Presumably the lost code
    # extracted the page's JS, rewrote the placeholder elements into
    # "我张壮\1"-prefixed markers and wrote a local HTML file for
    # ``webdr`` to render -- TODO restore from the author's original file
    # before running. Commented-out lines kept from the original:
    # text_con = html.xpath("""//div[@class="text-con"]""")[0]
    # text_con = etree.tostring(text_con,method="html").decode("utf-8")
    text_con = re.findall(r"""""", "我张壮\1", text_con)
    html_ = re.sub("""需要替换的文字""", text_con, data2, count=1, flags=re.S)
    g.write(html_)

    # Run the rendering step and collect the rendered text.
    js_text = webdr.get_()
    # Markers that still need to be replaced by real characters.
    list_wenz = re.findall(r"我张壮.e...", js_text)
    # Last four characters of each distinct marker. Taken in a single pass
    # over the set so that the glyph name and the marker suffix always
    # refer to the same entry.
    codes = [marker[-4:] for marker in set(list_wenz)]

    # Locate and download the font file referenced by the page.
    rst = re.findall(".{80}ttf", data)[0]
    rst = re.findall(r"\('(.*?ttf)", rst)[0]
    url = "https:" + rst
    ttf = requests.get(url, headers=headers, timeout=15)
    with open("qczj1.ttf", "wb") as f:
        f.write(ttf.content)

    for code in codes:
        glyph_name = "uni" + code.upper()
        # Per the original comment, ``mainn`` saves the glyph as an image;
        # ``te`` then returns its class label for the saved JPEG.
        mainn(glyph_name)
        label = te("F:\\汽车之家\\autohome\\prct\\" + glyph_name + ".jpeg")
        char = iff(int(label))
        # Literal replacement: the marker is plain text, not a regex.
        js_text = js_text.replace("我张壮\x01%s;" % code, char)

    # Store the restored article text.
    print(js_text)
    content = js_text
    my_db.update((publish_time, title, content, info_url))


def iff(data):
    """Translate a classifier label into the character it stands for.

    Args:
        data: Class label, expected to be an integer from 0 to 89.

    Returns:
        The mapped character. For any other value an error message is
        printed and ``data`` is returned unchanged.
    """
    try:
        return _LABEL_TO_CHAR[data]
    except (KeyError, TypeError):
        # KeyError: label outside 0-89; TypeError: unhashable input.
        print("数据替换出错")
        return data


if __name__ == '__main__':
    info_url = my_db.get("select info_url from text_info order by id asc limit 0,10;")
    for i in info_url:
        print(i[0])
        run(i[0])
        # Pause between pages.
        time.sleep(20)