import datetime
import os
import random
import sqlite3
import threading
import time
import urllib.parse
import urllib.request

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
class MySpider:
    """Scrape product listings with headless Chrome into SQLite plus image files.

    One row per product is written to the table ``phones`` in ``phones.db``
    and each product image is downloaded into ``imagePath`` by a worker
    thread.  The XPath selectors used (``J_goodsList``, ``gl-item``,
    ``p-price`` ...) presumably match JD.com search-result pages; the start
    URL itself is supplied by the caller.
    """

    # HTTP headers sent with every image request.
    # NOTE: the attribute name is misspelled ("headres"); it is kept unchanged
    # so that existing references to ``MySpider.headres`` keep working.
    headres = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/70.0.3538.25 Safari/537.36 "
                      "Core/1.70.3741.400 QQBrowser/10.5.3863.400"
    }
    # Directory (relative to the working directory) that receives the images.
    imagePath = "download/dianfanguo"

    def startUp(self, url, key, getPage=0):
        """Open the browser, reset database and image directory, run the search.

        Args:
            url: Page to open; it must contain an input element with id ``key``.
            key: Search text typed into that input, followed by ENTER.
            getPage: Maximum number of result pages to process; ``0`` means
                no limit.
        """
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        # NOTE(review): ``chrome_options=`` and the ``find_element_by_*``
        # helpers belong to the older Selenium API this file is written for.
        self.driver = webdriver.Chrome(chrome_options=chrome_options)
        self.threads = []
        self.No = 0        # running product number, see _processItem
        self.imgNo = 0     # not read anywhere in this class; kept for callers
        self.Page = 1
        self.getPage = getPage

        try:
            self.con = sqlite3.connect("phones.db")
            self.cursor = self.con.cursor()
            try:
                # "if exists" avoids a spurious error on the very first run.
                self.cursor.execute("drop table if exists phones")
            except Exception:
                print("删除曾经phones表err")
            try:
                sql = "create table phones (" \
                      "mNo varchar(32) primary key, " \
                      "mMark varchar (256)," \
                      "mPrice varchar (32)," \
                      "mNote varchar (1024)," \
                      "mFile varchar (256))"
                self.cursor.execute(sql)
            except Exception:
                print("建立新phones表err")
        except Exception as err:
            print(err)

        try:
            # makedirs also creates the missing parent ("download").
            os.makedirs(MySpider.imagePath, exist_ok=True)
            for img in os.listdir(MySpider.imagePath):
                os.remove(os.path.join(MySpider.imagePath, img))
        except Exception as err:
            print(err)

        self.driver.get(url)
        keyInput = self.driver.find_element_by_id("key")
        keyInput.send_keys(key)
        keyInput.send_keys(Keys.ENTER)

    def closeUp(self):
        """Commit and close the database, then shut the browser down.

        The two clean-ups are independent so that a database error cannot
        prevent the browser from being terminated.  Errors are printed.
        """
        try:
            self.con.commit()
            self.cursor.close()
            self.con.close()
        except Exception as err:
            print(err)
        try:
            # quit() ends the whole WebDriver session (driver process
            # included); close() would only close the current window.
            self.driver.quit()
        except Exception as err:
            print(err)

    def insertDB(self, mNo, mMark, mPrice, mNote, mFile):
        """Insert one product row into ``phones``; print the error on failure."""
        try:
            sql = "insert into phones (mNO,mMark,mPrice,mNote,mFile) values(?,?,?,?,?)"
            self.cursor.execute(sql, (mNo, mMark, mPrice, mNote, mFile))
        except Exception as err:
            print("输入数据:{}失败".format(mNo))
            print(err)

    def showDB(self):
        """Print every row of ``phones`` (from ``phones.db``) ordered by ``mNo``."""
        try:
            con = sqlite3.connect("phones.db")
            try:
                cursor = con.cursor()
                print("%-8s %-16s %-8s %-16s %s" % ("No", "Mark", "Price", "Image", "Note"))
                cursor.execute("select mNo,mMark,mPrice,mFile,mNote from phones order by mNo")
                for row in cursor.fetchall():
                    print("%-8s %-16s %-8s %-16s %s" % tuple(row))
                cursor.close()
            finally:
                # Always release the connection, even when the query fails.
                con.close()
        except Exception as err:
            print(err)

    @staticmethod
    def _fetch(src):
        """Return the bytes served at ``src``, or ``None`` if empty or failing."""
        if not src:
            return None
        try:
            req = urllib.request.Request(src, headers=MySpider.headres)
            with urllib.request.urlopen(req, timeout=400) as resp:
                return resp.read()
        except Exception:
            # Best effort: the caller falls back to the alternative URL.
            return None

    def download(self, src1, src2, mFile):
        """Download an image to ``imagePath/mFile``.

        Args:
            src1: Preferred image URL (may be empty).
            src2: Fallback URL, tried only when ``src1`` yields no data.
            mFile: Target file name inside ``imagePath``.
        """
        data = self._fetch(src1) or self._fetch(src2)
        if not data:
            print("download failed", mFile)
            return
        try:
            with open(os.path.join(MySpider.imagePath, mFile), "wb") as fobj:
                fobj.write(data)
        except OSError as err:
            print(err)
            return
        print("download", mFile)

    @staticmethod
    def _splitNote(text):
        """Return ``(mark, note)`` for a raw product title.

        ``mark`` is the first space-separated token of the stripped title;
        ``note`` is the stripped title with all spaces removed.  The token is
        taken *before* the spaces are removed, otherwise it would always be
        the complete title.
        """
        text = text.strip()
        mark = text.split(" ")[0]
        return mark, text.replace(" ", "")

    @staticmethod
    def _imageFileName(no, src):
        """Return ``no`` plus the file extension found in the path of ``src``."""
        # Use only the URL path so query strings or host names never leak
        # into the extension.
        path = urllib.parse.urlsplit(src).path
        return no + os.path.splitext(path)[1]

    def _processItem(self, li):
        """Extract one product from list element ``li``, store it, fetch its image."""
        img_xpath = ".//div[@class='p-img']//a//img"
        try:
            src1 = li.find_element_by_xpath(img_xpath).get_attribute("src") or ""
        except Exception:
            src1 = ""
        try:
            src2 = li.find_element_by_xpath(img_xpath).get_attribute("data-lazy-img") or ""
        except Exception:
            src2 = ""
        try:
            price = li.find_element_by_xpath(".//div[@class='p-price']//i").text
        except Exception:
            price = "0"
        try:
            raw = li.find_element_by_xpath(
                ".//div[@class='p-name p-name-type-2']//em").text
            mark, note = self._splitNote(raw)
        except Exception:
            note = ""
            mark = ""

        self.No = self.No + 1
        no = str(self.No).zfill(6)
        print(no, mark, price)

        # Make both candidates absolute so the fallback URL is usable too.
        base = self.driver.current_url
        if src1:
            src1 = urllib.parse.urljoin(base, src1)
        if src2:
            src2 = urllib.parse.urljoin(base, src2)

        if src1 or src2:
            mFile = self._imageFileName(no, src1 or src2)
            T = threading.Thread(target=self.download,
                                 args=(src1, src2, mFile), daemon=False)
            T.start()
            self.threads.append(T)
        else:
            mFile = ""
        self.insertDB(no, mark, price, note, mFile)

    def _gotoNextPage(self):
        """Click the "next page" link; return ``False`` when there is none to visit.

        Raises whatever Selenium raises if the enabled link cannot be found
        or clicked; ``processSpider`` reports that error.
        """
        try:
            self.driver.find_element_by_xpath(
                "//span[@class='p-num']//a[@class='pn-next disabled']")
        except Exception:
            pass  # no disabled link found: a further page may exist
        else:
            return False
        if self.getPage != 0 and self.Page >= self.getPage:
            return False
        nextPage = self.driver.find_element_by_xpath(
            "//span[@class='p-num']//a[@class='pn-next']")
        nextPage.click()
        self.Page += 1
        return True

    def processSpider(self):
        """Process the current result page and following pages up to ``getPage``.

        Any error ends the crawl and is printed instead of being raised.
        """
        try:
            while True:
                # Random pause before reading each page.
                time.sleep(random.uniform(1, 2))
                print(self.driver.current_url)
                lis = self.driver.find_elements_by_xpath(
                    "//div[@id='J_goodsList']//li[@class='gl-item']")
                for li in lis:
                    self._processItem(li)
                if not self._gotoNextPage():
                    break
        except Exception as err:
            print(err)

    def executeSpider(self, url, key, getPage):
        """Run a complete crawl: start up, scrape, clean up, wait for downloads.

        Args:
            url: Start page passed to ``startUp``.
            key: Search text passed to ``startUp``.
            getPage: Page limit passed to ``startUp`` (``0`` = unlimited).
        """
        starttime = datetime.datetime.now()
        print("Spider starting......")
        try:
            self.startUp(url, key, getPage)
            print("startUp")
            self.processSpider()
            print("processSpider")
        finally:
            # Release browser and database even if start-up failed half-way.
            self.closeUp()
            print("closeUp")
        for t in self.threads:
            t.join()
        print("Spider completed.....")
        endtime = datetime.datetime.now()
        # total_seconds() also counts whole days, unlike ``.seconds``.
        elapsed = int((endtime - starttime).total_seconds())
        print("Total", elapsed, "seconds elapsed")
# Crawl settings used by menu option 1.
url = "https://www.jd.com/"
key = "电饭锅"
num = 3  # page limit handed to executeSpider as getPage
spider = MySpider()

# Interactive menu: "1" runs a crawl, "2" prints the stored rows, "3" exits;
# any other input simply shows the menu again.
while True:
    print("1.爬取", "2.显示", "3.退出", sep="\n")
    s = input("请选择(1,2,3):")
    if s == "3":
        break
    if s == "1":
        spider.executeSpider(url, key, getPage=num)
    elif s == "2":
        spider.showDB()