Dianping doesn't hide shop phone numbers behind a login, and it doesn't mask the middle digits either. Instead it obfuscates them with a front-end SVG trick. There are two ways around that:
1. Screenshot the page and run OCR on it
2. Decode the obfuscation in code (the approach this post takes; the sketch below shows the core idea)
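Before the full scraper, here is a minimal sketch of the decoding idea. Each hidden digit on the page is an empty tag whose CSS class carries a background offset into a large SVG sheet of digits; flipping the offset's signs gives the (x, y) position of the real glyph. Every value below (the class name, offsets, and SVG row) is invented for illustration; on the real page they come from Dianping's stylesheet and SVG file.

import re

# Invented sample data standing in for what the real page serves:
css_rule = '.igv1{background:-78.0px -92.0px;}'  # hypothetical class for one hidden digit
svg_row_y = 122                                  # y of one <text> row in the SVG sheet
svg_row_x = '0 14 28 42 56 70 84 98 112 126'     # x of each glyph in that row
svg_row_digits = '3578962014'                    # digits drawn at those positions

# 1. read the background offset off the CSS rule
x_off, y_off = re.search(r'background:(-?[\d.]+)px (-?[\d.]+)px', css_rule).groups()

# 2. flip the signs and nudge into the glyph cell (+6 / +30 are the same
#    empirical offsets the full scraper below uses)
x = abs(int(float(x_off))) + 6   # -> 84
y = abs(int(float(y_off))) + 30  # -> 122
assert y == svg_row_y            # the nearest <text> row is this one

# 3. the glyph at that x position within the row is the hidden digit
digit = dict(zip(svg_row_x.split(), svg_row_digits))[str(x)]
print(digit)  # -> '2'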
On to the full code:
"""
author:yaoye
date : 2019-03-20
"""
import requests
from random import choice
from bs4 import BeautifulSoup
from lxml import etree
import re
import numpy as np
class Dazhongdp(object):
    def __init__(self):
        # pick a random desktop User-Agent per run to look less like a script
        ua = [
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
            'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0',
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        ]
        self.headers = {
            'User-Agent': choice(ua),
            'Connection': 'keep-alive',
            'Host': 'www.dianping.com',
        }
    def get_url(self):
        # sample shop page; swap in any shop id
        url = 'http://www.dianping.com/shop/3500059'
        return url
    def parse_url(self, url):
        req = requests.get(url, headers=self.headers)
        html = req.text
        soup = BeautifulSoup(html, 'lxml')
        # name = soup.select('h1.shop-name')[0].get_text().strip()
        # print(name)
        return self.get_phone(html, url)
    def get_phone(self, html, url):
        r = etree.HTML(html)
        item = r.xpath('//p[@class="expand-info tel"]')[0]
        # serialize as unicode so the 电话 label and &nbsp; survive as characters
        item = etree.tostring(item, encoding='unicode')
        # strip the wrapper tags and the label, keeping the plain digits and the
        # empty placeholder tags (assumed here to be <d class="..."></d>, the
        # markup Dianping used for obfuscated digits at the time)
        item = re.sub(r'</?p[^>]*>|</?span[^>]*>|电话[::]?|\n', '', item, flags=re.S)
        # a shop can list several numbers, separated by non-breaking spaces
        items = [s for s in re.split(r'&nbsp;|&#160;|\xa0', item) if s.strip()]
        print(items)
        code = self.get_css_code(url)
        phones = []
        for item in items:
            # group 1: class name of a hidden digit, group 2: a run of digits
            # that was left in plain text
            item_list = re.findall(r'<d class="(.*?)"></d>|(\d+)', item, re.S)
            rel_list = [cls if cls else num for cls, num in item_list]
            matchlist = self.match_code(rel_list, code)
            phones.append(''.join(matchlist))
            print(''.join(matchlist))
        return phones
    def match_code(self, rel_list, code):
        matchlist = []
        for i in rel_list:
            if i.isdigit():
                # a digit that was never obfuscated: keep it as-is
                matchlist.append(i)
            else:
                # a CSS class name: resolve it against the svg glyph sheet
                matchlist.append(self.css_xy(i, code))
        return matchlist
    def get_css(self, url):
        response = requests.get(url, headers=self.headers).text
        html = etree.HTML(response)
        r = html.xpath('//link[@rel="stylesheet"]/@href')
        # prefer the stylesheet that carries the svg text mapping instead of
        # relying on its position in the list
        href = next((h for h in r if 'svgtextcss' in h), r[1])
        return 'http:' + href
    def get_css_code(self, url):
        css_url = self.get_css(url)
        # note: no If-Modified-Since / If-None-Match here, or the CDN may
        # answer 304 with an empty body
        css_header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 's3plus.meituan.net',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        }
        css = requests.get(css_url, headers=css_header)
        css.encoding = 'utf-8'
        return css.text
    def css_xy(self, i, code):
        # svg glyph sheet the classes point at; hard-coded to one snapshot of
        # the site (see the note below the class for a more robust lookup)
        svg_url = 'http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/f8350660159e938ca81d948ca9d0d555.svg'
        # each class carries an offset like: .igv1{background:-169.0px -92.0px;}
        result = re.search(i + r'{background:([\s\S]*?)px ([\s\S]*?)px;}', code, re.S).groups()
        # flip the signs and nudge into the glyph cell; +6 / +30 are empirical
        # offsets for this sheet's glyph grid
        x = abs(int(float(result[0]))) + 6
        y = abs(int(float(result[1]))) + 30
        return self.get_svg(svg_url, x, y)
    def get_svg(self, svg_url, x_, y):
        response = requests.get(svg_url)
        html = etree.HTML(response.content)
        # the sheet draws rows of digits as <text x="0 14 ..." y="...">...</text>;
        # pick the row whose y is closest to the offset we computed
        y_list = np.array(html.xpath('//text/@y')).astype(np.int64)
        y_ = y_list[np.abs(y_list - y).argmin()]
        x = html.xpath('//text[@y="{y}"]/@x'.format(y=y_))
        num = html.xpath('//text[@y="{y}"]/text()'.format(y=y_))
        # map each x coordinate in the row to the digit drawn there, then read
        # off the digit at our x (assumes the offsets align exactly with the grid)
        dict_x = dict(zip(x[0].split(), list(num[0])))
        return dict_x[str(x_)]
    def parse_item(self):
        url = self.get_url()
        self.parse_url(url)


if __name__ == '__main__':
    A = Dazhongdp()
    A.parse_item()
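One caveat: css_xy pins svg_url to a single snapshot of the glyph sheet, and that file name changes whenever Dianping rotates its assets. A more robust variant (a sketch, assuming the stylesheet references the sheet through a background-image: url(//...svg) rule, which is how these sheets were wired up at the time) would pull the URL out of the CSS that get_css_code already downloads:

import re

def svg_url_from_css(code):
    """Pull the svg glyph-sheet URL out of the downloaded stylesheet.

    Assumes a rule shaped like: background-image: url(//s3plus.meituan.net/...svg);
    """
    m = re.search(r'background-image:\s*url\((//[^)]+?\.svg)\)', code)
    return 'http:' + m.group(1) if m else None

# usage inside css_xy, instead of the hard-coded constant:
#     svg_url = svg_url_from_css(code)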