SVG 是用于描述二维矢量图形的一种图形格式。它基于 XML 描述图形,对图形进行放大或缩小操作都不会影响图形质量。矢量图形的这个特点使得它被广泛应用在 Web 网站中。
已知:
类名:vhkjj4
坐标:(-316px, -141px),取绝对值后为 (316, 141)
网站链接: http://www.dianping.com/shop/130096343/review_all
def down_data(url, cookie):
    """Fetch ``url`` and save the response body as ``dazhong.html``.

    The request carries the given cookie, a dianping.com Referer and a
    desktop Chrome User-Agent. The file is written to the current working
    directory, encoded as UTF-8. Nothing is returned.
    """
    request_headers = {
        "Cookie": cookie,
        "Referer": "http://www.dianping.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    response = requests.get(url, headers=request_headers)
    page_source = response.text
    with open('dazhong.html', 'w', encoding='utf-8') as html_file:
        html_file.write(page_source)
# Step: download the review page, then the stylesheet and SVG glyph sheets.
url = 'http://www.dianping.com/shop/130096343/review_all'
cookie = ''
down_data(url, cookie)

# `ret` and `headers` are locals of down_data(), so they do not exist at
# module level. Rebuild the headers and reload the page that down_data()
# just saved so this step can run on its own.
headers = {
    "Cookie": cookie,
    "Referer": "http://www.dianping.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
}
with open('dazhong.html', 'r', encoding='utf-8') as f:
    ret = f.read()

# NOTE(review): the original pattern was an empty string (its HTML content was
# lost), which made css_url resolve to just 'https:'. This pattern is a
# reconstruction of the stylesheet <link> tag - confirm against a live page.
css_url = re.findall(r'<link rel="stylesheet" type="text/css" href="(//s3plus\.meituan\.net.*?)">', ret)
css_url = 'https:' + css_url[0]
css_response = requests.get(css_url, headers=headers).text
with open('dazhong.css', 'w', encoding='utf-8') as f:
    f.write(css_response)

# Each [class^="<prefix>"] rule names the SVG sheet used for that class prefix.
with open('dazhong.css', encoding='utf-8') as css:
    svg_urls = re.findall(r'.*?\[class\^="(.*?)"\]\{.*?background-image: url\((.*?)\);', css.read())
for svg_url in svg_urls:
    name, svg = svg_url
    svg_url = 'https:' + svg  # URLs in the CSS are protocol-relative
    print(svg_url)
    svg_response = requests.get(svg_url).text
    with open(f'{name}_dazhong.svg', 'w', encoding='utf-8') as f:
        f.write(svg_response)
# Step: read one SVG glyph sheet and index its text rows by y coordinate.
with open('be_dazhong.svg', 'r', encoding='utf-8') as f:
    svg_html = f.read()
sel = parsel.Selector(svg_html)
texts = sel.css('textPath')
paths = sel.css('path')

# path id -> second space-separated token of the path's "d" attribute
path_dict = {}
for path in paths:
    path_d = path.css('path::attr(d)').get()
    path_id = path.css('path::attr(id)').get()
    path_dict[path_id] = path_d.split(' ')[1]

# y coordinate -> row string; the n-th <textPath> is paired with path id "n"
count = 1
zpd_svg_dict = {}
for text in texts:
    row_text = text.css('textPath::text').get()
    zpd_svg_dict[path_dict[str(count)]] = row_text
    count += 1
# Step: map every CSS glyph class to the character it displays.
with open('dazhong.css', 'r', encoding='utf-8') as f:
    css_html = f.read()
# Captures (class name, x offset, y offset). Adjust the pattern to the class
# rules of the CSS actually downloaded. Raw string avoids invalid-escape warnings.
css_paths = re.findall(r'\.(.*?)\{background:-(\d+)\.0px -(\d+)\.0px;\}.*?', css_html)
last_map = {}
for css_path in css_paths:
    css_name, x, y = css_path
    # Column in the row: x offset divided by the 14px glyph width
    # (font-size:14px in the SVG style, per the original comment).
    index = int(x) // 14
    for i in zpd_svg_dict:
        if int(y) > int(i):
            continue
        # The first row whose y is not below the CSS offset is the only
        # candidate. Stop here even on failure: falling through to later rows
        # would silently store a character from the wrong row.
        try:
            last_map[css_name] = zpd_svg_dict[i][index]
        except IndexError as e:
            print(e)
        break
# Step: replace the obfuscated glyph placeholders in the saved page.
with open('dazhong.html', 'r', encoding='utf-8') as f:
    ret = f.read()
# NOTE(review): the original pattern and replacement literal had collapsed to a
# single space (their HTML tags were lost), so every lookup raised KeyError and
# nothing was decoded. The <svgmtsi> placeholder tag is a reconstruction -
# confirm against the saved page.
svg_list = re.findall(r'<svgmtsi class="(.*?)"></svgmtsi>', ret)
for svg in svg_list:
    try:
        ret = ret.replace(f'<svgmtsi class="{svg}"></svgmtsi>', last_map[svg])
    except KeyError:
        # Class has no entry in the map; leave the placeholder in place.
        print('KeyError', svg)
# Step: parse the decoded page and print the text nodes of each review item.
etre = etree.HTML(ret)
li_list = etre.xpath('//div[@class="reviews-items"]/ul/li')
for i in li_list:
    review_text_nodes = i.xpath('div[@class="main-review"]/div[@class="review-words Hide"]/text()')
    print(review_text_nodes)
import requests
from lxml import etree
import re
import parsel
def down_data(url, cookie):
    """Download the review page plus the CSS and SVG glyph sheets it references.

    Files written to the current working directory (all UTF-8):
      * ``dazhong.html`` - the raw review page
      * ``dazhong.css`` - the stylesheet linked from that page
      * ``<prefix>_dazhong.svg`` - one file per ``[class^="<prefix>"]`` rule
        found in the stylesheet

    Args:
        url: Address of the review page.
        cookie: Value sent as the ``Cookie`` request header.

    Raises:
        IndexError: If no stylesheet link is found in the page.
    """
    headers = {
        "Cookie": cookie,
        "Referer": "http://www.dianping.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    ret = requests.get(url, headers=headers).text
    with open('dazhong.html', 'w', encoding='utf-8') as f:
        f.write(ret)

    # NOTE(review): the original pattern was an empty string (its HTML content
    # was lost), which made css_url resolve to just 'https:'. This pattern is a
    # reconstruction of the stylesheet <link> tag - confirm against a live page.
    css_url = re.findall(r'<link rel="stylesheet" type="text/css" href="(//s3plus\.meituan\.net.*?)">', ret)
    css_url = 'https:' + css_url[0]
    css_response = requests.get(css_url, headers=headers).text
    with open('dazhong.css', 'w', encoding='utf-8') as f:
        f.write(css_response)

    # Each [class^="<prefix>"] rule names the SVG sheet used for that prefix.
    # The context manager closes the file even if the regex call raises.
    with open('dazhong.css', encoding='utf-8') as css:
        svg_urls = re.findall(r'.*?\[class\^="(.*?)"\]\{.*?background-image: url\((.*?)\);', css.read())
    print(svg_urls)

    for name, svg in svg_urls:
        svg_url = 'https:' + svg  # URLs in the CSS are protocol-relative
        print(svg_url)
        svg_response = requests.get(svg_url).text
        with open(f'{name}_dazhong.svg', 'w', encoding='utf-8') as f:
            f.write(svg_response)
def crack_data():
    """Build the CSS-class -> character table from the downloaded SVG and CSS.

    Reads ``be_dazhong.svg`` and ``dazhong.css`` from the current working
    directory. Each CSS rule gives an (x, y) background offset: y selects a
    text row of the SVG and x selects the character inside that row.

    Returns:
        dict mapping CSS class name to the single character it stands for.
        Classes whose offset cannot be resolved are omitted.
    """
    with open('be_dazhong.svg', 'r', encoding='utf-8') as f:
        svg_html = f.read()
    # Only the "be" sheet is loaded. The original left the "gs" and "rq" sheets
    # disabled; they could be appended to svg_html in the same way.
    sel = parsel.Selector(svg_html)
    texts = sel.css('textPath')
    paths = sel.css('path')

    # path id -> second space-separated token of the path's "d" attribute
    path_dict = {}
    for path in paths:
        path_dict[path.css('path::attr(id)').get()] = path.css('path::attr(d)').get().split(' ')[1]
    print(path_dict)

    # y coordinate -> row string; the n-th <textPath> is paired with path id "n"
    zpd_svg_dict = {}
    for count, text in enumerate(texts, start=1):
        zpd_svg_dict[path_dict[str(count)]] = text.css('textPath::text').get()
    print('zpd_svg_dict', zpd_svg_dict)

    with open('dazhong.css', 'r', encoding='utf-8') as f:
        css_html = f.read()
    # Captures (class name, x offset, y offset). Adjust the pattern to the class
    # rules of the CSS actually downloaded. Raw string avoids invalid-escape
    # warnings.
    css_paths = re.findall(r'\.(.*?)\{background:-(\d+)\.0px -(\d+)\.0px;\}.*?', css_html)
    print(css_paths)

    last_map = {}
    for css_name, x, y in css_paths:
        # Column in the row: x offset divided by the 14px glyph width
        # (font-size:14px in the SVG style, per the original comment).
        index = int(x) // 14
        for row_y, row_text in zpd_svg_dict.items():
            if int(y) > int(row_y):
                continue
            # The first row whose y is not below the CSS offset is the only
            # candidate. Stop here even on failure: falling through to later
            # rows would silently store a character from the wrong row.
            try:
                last_map[css_name] = row_text[index]
            except IndexError as e:
                print(e)
            break
    return last_map
def decode_html(last_map):
    """Replace the obfuscated glyph placeholders in the saved review page.

    Reads ``dazhong.html`` from the current working directory, finds every
    empty ``<svgmtsi class="..."></svgmtsi>`` placeholder and substitutes the
    character that ``last_map`` associates with its class name.

    Args:
        last_map: Mapping of CSS class name -> decoded character.

    Returns:
        The page HTML with every known placeholder replaced. Placeholders
        whose class is missing from ``last_map`` are left untouched and
        reported on stdout as ``KeyError <class>``.
    """
    with open('dazhong.html', 'r', encoding='utf-8') as f:
        ret = f.read()
    # NOTE(review): the original pattern and replacement literal had collapsed
    # to a single space (their HTML tags were lost), so every lookup raised
    # KeyError and the page came back unchanged. The <svgmtsi> placeholder tag
    # is a reconstruction - confirm against the saved page.
    svg_list = re.findall(r'<svgmtsi class="(.*?)"></svgmtsi>', ret)
    print(svg_list)
    for svg in svg_list:
        try:
            ret = ret.replace(f'<svgmtsi class="{svg}"></svgmtsi>', last_map[svg])
        except KeyError:
            # Class has no entry in the map; leave the placeholder in place.
            print('KeyError', svg)
    return ret
def get_text(html):
    """Print the review text nodes found in the decoded page.

    The literal label '消费后评价' is stripped from ``html`` first. For each
    ``<li>`` under ``div.reviews-items``, the list of text nodes of its
    ``div.review-words Hide`` element is then printed. Nothing is returned.
    """
    cleaned = html.replace('消费后评价', '')
    tree = etree.HTML(cleaned)
    review_xpath = 'div[@class="main-review"]/div[@class="review-words Hide"]/text()'
    for item in tree.xpath('//div[@class="reviews-items"]/ul/li'):
        print(item.xpath(review_xpath))
if __name__ == '__main__':
    # Pipeline: download -> build class/character map -> decode page -> print.
    url = 'http://www.dianping.com/shop/130096343/review_all'
    cookie = ''
    try:
        down_data(url, cookie)
    except Exception as e:
        # Best effort: carry on with whatever files are already on disk, but
        # show the real cause so that network or parsing failures are not all
        # reported as a verification problem.
        print('遇到验证问题', e)
    data = crack_data()
    html = decode_html(data)
    get_text(html)