JAVA代码
@RequestMapping("public/test1")
@ApiImplicitParam(paramType = "form", dataType = "int", name = "url", value = "商品路径", required = true)
public void test(HttpServletRequest request){
    // Runs a local Python scraper with the user-supplied product URL and
    // prints the parsed JSON result ("DETAIL" holds detail-image URLs).
    // SECURITY NOTE(review): "url" comes straight from the request. The
    // array form of exec avoids shell injection, but the Python script will
    // fetch an arbitrary URL (SSRF risk) — validate/whitelist it upstream.
    String url = request.getParameter("url");
    String[] args1 = new String[] { "python", "E:\\python\\project\\test\\test2.py", url };
    StringBuilder result = new StringBuilder();
    try {
        Process process = Runtime.getRuntime().exec(args1);
        // try-with-resources guarantees the reader (and the process's
        // stdout pipe) is closed even when readLine() throws — the
        // original leaked it on any IOException mid-read.
        try (BufferedReader in = new BufferedReader(new InputStreamReader(
                process.getInputStream()))) {
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        }
        process.waitFor();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the servlet container's thread
        // pool can observe the interruption; swallowing it loses the signal.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }
    System.out.println(result.toString());
    JSONObject json = JSONObject.parseObject(result.toString());
    System.out.println(json.get("DETAIL"));
}
python脚本(抄的)
import requests
import re, sys, os
import json
import threading
import pprint
class spider:
    """Scrape a Tmall product page for its main and detail image URLs."""

    def __init__(self, url, name):
        # url: product page to scrape.
        # name: directory prefix used by the (currently disabled) download
        #       helpers further down the original script.
        self.url = url
        # Browser-like headers so the site serves the normal desktop page.
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;",
            "Accept-Encoding": "gzip",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Referer": "http://www.example.com/",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36"
        }
        self.name = name

    def openurl(self, url):
        """GET *url* with the spoofed headers; return the body text, or
        None on a non-2xx response (made explicit — the original fell
        through implicitly)."""
        self.request = requests.get(url, headers=self.headers)
        if self.request.ok:
            return self.request.text
        return None

    def matchs(self):
        """Extract image URLs from the product page.

        Returns a dict {"MAIN": [...], "DETAIL": [...]} and caches it on
        self.newdata.

        Raises ValueError when a page cannot be fetched or the embedded
        product JSON is missing — previously these failure modes surfaced
        as opaque TypeError / IndexError deep inside the method.
        """
        tmall_exp = r"Setup\(([\s\S]+?)\);"  # regex for the embedded product JSON
        detail = r"src=\"(https://img\S+?[jpgifn]+?)\""  # regex for detail-image URLs
        html = self.openurl(self.url)
        if html is None:
            raise ValueError("failed to fetch product page: %s" % self.url)
        data = re.findall(tmall_exp, html)
        if not data:
            raise ValueError("product JSON not found in page: %s" % self.url)
        data = json.loads(data[0])
        main_img = data['propertyPics']  # main + per-colour image addresses
        # Colour/SKU list (colour code, colour name, skuID) — currently
        # unused; kept because the disabled COLOR extraction relied on it.
        color_data = data['valItemInfo']['skuList']
        detail_html = self.openurl("http:" + data['api']["httpsDescUrl"])
        if detail_html is None:
            raise ValueError("failed to fetch product detail page")
        detail_image = re.findall(detail, detail_html)
        self.newdata = {"MAIN": main_img['default'], "DETAIL": detail_image}
        return self.newdata
# def download(self):
# if len(self.newdata) > 0:
# for x in range(len(self.newdata['MAIN'])):
# threading.Thread(target=self.download_main, args=(self.newdata['MAIN'][x], x)).start()
#
# for x in self.newdata['COLOR']:
# threading.Thread(target=self.download_color, args=(x,)).start()
# for x in range(len(self.newdata['DETAIL'])):
# threading.Thread(target=self.download_detail, args=(self.newdata['DETAIL'][x], x)).start()
# return
# def download_main(self, url, index):
# try:
# img = requests.get("http:" + url, stream=True, headers=self.headers, timeout=10)
#
# except:
# print(sys.exc_info())
# return
# if img.ok:
# if not os.path.exists(self.name + "/main"):
# try:
# os.makedirs(self.name + "/main")
# except:
# pass
# imgs = open(self.name + "/main/%s.jpg" % index, "wb")
# imgs.write(img.content)
# imgs.close()
# def download_color(self, url):
#
# try:
# img = requests.get("http:" + url[list(url.keys())[0]][0], stream=True, headers=self.headers, timeout=10)
#
# except:
# print(sys.exc_info())
# return
# if img.ok:
# if not os.path.exists(self.name + "/color"):
# try:
# os.makedirs(self.name + "/color")
# except:
# pass
# if "/" in list(url.keys())[0]:
# color = list(url.keys())[0].replace("/", "_")
# elif "\\" in list(url.keys())[0]:
# color = list(url.keys())[0].replace("\\", "_")
# else:
# color = list(url.keys())[0]
# imgs = open(self.name + "/color/%s.jpg" % color, "wb")
# imgs.write(img.content)
# imgs.close()
# def download_detail(self, url, index):
#
# try:
# img = requests.get(url, stream=True, headers=self.headers, timeout=10)
# except:
# print(sys.exc_info())
# return
# if img.ok:
# if not os.path.exists(self.name + "/detail"):
# try:
# os.makedirs(self.name + "/detail")
# except:
# pass
#
# imgs = open(self.name + "/detail/%s.jpg" % index, "wb")
# imgs.write(img.content)
#
# imgs.close()
if __name__ == "__main__":
    # Usage: python test2.py <product-url>
    # The URL is handed in by the Java caller as the single CLI argument.
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("usage: python test2.py <product-url>")
    url = sys.argv[1]
    taobao = spider(url, "下载图片/T")
    data = taobao.matchs()
    pprint.pprint(data)
    # taobao.download()
其中JAVA调用时有个大坑,就是会报
java java.io.IOException: Cannot run program "python": CreateProcess error=2, 系统找不到指定的文件。
错误
在网上搜了半天,把能用的方法都用了,最后解决了,但不知道哪个方法有效
1、java的Run--Run Configurations...--Environment---New---Name:PATH;Value:python路径(如:E:\Program Files\Python\Python37)---Apply(或者Run)
2、配置python环境变量,发现无效,因为今天上午刚配的,网上说在MyEclipse启动时配置的Path不生效,需重启MyEclipse,重启后发现居然成了。。
另外,本来还想爬取淘宝详情的数据的,因为刚接触python,连正则都不会写,研究了半天无果,就先放弃了,如果有大佬会的话欢迎指点。
淘宝的话:tmall_exp = r"Setup\(([\s\S]+?)\);" ### 匹配商品数据的正则
这一句改为 tmall_exp = r"g_config\(([\s\S]+?)\);" ### 匹配商品数据的正则
可以打印匹配后的html看看