from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def search(url, key):
    """Open ``url``, search for ``key`` and return the total page count text.

    Drives the module-level ``browser``: loads the page, types ``key`` into
    the ``#key`` box, clicks the search button and waits for the pager's
    total-page element to appear.

    Args:
        url: Address of the page that hosts the search box.
        key: Search keyword typed into the box.

    Returns:
        The text of the pager's total-page element, as a string. The caller
        is responsible for converting it to ``int``.

    If any wait times out, the search is attempted again from the initial
    page load. Retrying here (instead of re-entering ``main()``) keeps the
    return value a string, so the caller's ``int()`` conversion never
    receives ``None`` and the crawl is not started a second time.
    """
    while True:
        try:
            browser.get(url)
            page_wait = WebDriverWait(browser, 10)
            # Locators are (By, selector) tuples, hence the doubled parentheses.
            search_box = page_wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#key"))
            )
            button = page_wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#search > div > div.form > button")
                )
            )
            search_box.send_keys(key)
            button.click()
            page_number = page_wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#J_bottomPage > span.p-skip > em:nth-child(1) > b')
                )
            )
            print('正在解析第1页')
            return page_number.text
        except TimeoutException:
            # Reload the page and try the whole search again.
            continue
def page_next(page_number):
    """Jump to results page ``page_number`` through the pager's skip-to box.

    Uses the module-level ``wait``. The page number is typed into the pager
    input, the adjacent link is clicked, and the function then waits until
    the highlighted current-page link shows ``page_number``. When any of the
    waits times out, the whole jump is attempted again.

    Args:
        page_number: Page to jump to.
    """
    skip_input_locator = (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > input")
    skip_link_locator = (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > a")
    current_page_locator = (By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.curr')
    try:
        skip_input = wait.until(
            EC.presence_of_element_located(skip_input_locator)
        )
        skip_link = wait.until(
            EC.element_to_be_clickable(skip_link_locator)
        )
        # The box must be emptied first, otherwise the new number is appended
        # to whatever it already contains.
        skip_input.clear()
        skip_input.send_keys(page_number)
        skip_link.click()
        wait.until(
            EC.text_to_be_present_in_element(current_page_locator, str(page_number))
        )
    except TimeoutException:
        page_next(page_number)
def product():
    """Print picture URL, price and title for each product on the current page.

    Waits for ``#J_goodsList`` to be present in the module-level ``browser``,
    parses the page source with pyquery and prints one dict per ``<li>``
    found under the list. Nothing is returned.
    """
    # Longer timeout than the other waits: the listing also loads images.
    waits = WebDriverWait(browser, 20)
    waits.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#J_goodsList')
    ))
    # TODO: scroll the page down with JavaScript, at a controlled speed, so
    # that every product is loaded. This would have to happen before
    # page_source is read below. Only jumping straight to the bottom of the
    # page is known to work so far; pacing the scroll is still unsolved.
    html = browser.page_source
    doc = pq(html)
    # Mind the difference between id, class and tag selectors here.
    items = doc('#J_goodsList > ul li').items()
    for item in items:
        products = {
            # find() is pyquery's search over all descendant nodes.
            'picture': item.find('img').attr('src'),
            'price': item.find('i').text()[:5],
            'title': item.find('em').text()
            # TODO: add the shop name. Extracting it with pyquery and with
            # regular expressions both failed, so it is left out for now.
        }
        print(products)
def main():
    """Search JD for the keyword and print products from the first pages.

    Page 1 is printed right after the search; pages 2 to 4 follow, limited
    to the total page count reported by ``search()`` so that a page which
    does not exist is never requested.
    """
    url = "https://www.jd.com/"
    key = "手机"
    total_pages = int(search(url=url, key=key))
    product()
    # Only a few pages are crawled to keep data usage low. range() excludes
    # its upper bound, hence the +1.
    last_page = min(4, total_pages)
    for i in range(2, last_page + 1):
        print('正在解析第%d页' % i)
        page_next(i)
        product()
if __name__ == '__main__':
    # Module-level globals: ``browser`` is read by search() and product(),
    # ``wait`` by page_next().
    browser = webdriver.Firefox()
    try:
        wait = WebDriverWait(browser, 15)
        main()
    finally:
        # Close the Firefox session even when the crawl fails, so no browser
        # process is left running.
        browser.quit()
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def search(url, key):
    """Open ``url``, search for ``key`` and return the total page count text.

    Drives the module-level ``browser``: loads the page, types ``key`` into
    the ``#key`` box, clicks the search button and waits for the pager's
    total-page element to appear.

    Args:
        url: Address of the page that hosts the search box.
        key: Search keyword typed into the box.

    Returns:
        The text of the pager's total-page element, as a string. The caller
        is responsible for converting it to ``int``.

    If any wait times out, the search is attempted again from the initial
    page load. Retrying here (instead of re-entering ``main()``) keeps the
    return value a string, so the caller's ``int()`` conversion never
    receives ``None`` and the crawl is not started a second time.
    """
    while True:
        try:
            browser.get(url)
            page_wait = WebDriverWait(browser, 10)
            # Locators are (By, selector) tuples, hence the doubled parentheses.
            search_box = page_wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#key"))
            )
            button = page_wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#search > div > div.form > button")
                )
            )
            search_box.send_keys(key)
            button.click()
            page_number = page_wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#J_bottomPage > span.p-skip > em:nth-child(1) > b')
                )
            )
            print('正在解析第1页')
            return page_number.text
        except TimeoutException:
            # Reload the page and try the whole search again.
            continue
def page_next(page_number):
    """Jump to results page ``page_number`` through the pager's skip-to box.

    Uses the module-level ``wait``. The page number is typed into the pager
    input, the adjacent link is clicked, and the function then waits until
    the highlighted current-page link shows ``page_number``. When any of the
    waits times out, the whole jump is attempted again.

    Args:
        page_number: Page to jump to.
    """
    skip_input_locator = (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > input")
    skip_link_locator = (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > a")
    current_page_locator = (By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.curr')
    try:
        skip_input = wait.until(
            EC.presence_of_element_located(skip_input_locator)
        )
        skip_link = wait.until(
            EC.element_to_be_clickable(skip_link_locator)
        )
        # The box must be emptied first, otherwise the new number is appended
        # to whatever it already contains.
        skip_input.clear()
        skip_input.send_keys(page_number)
        skip_link.click()
        wait.until(
            EC.text_to_be_present_in_element(current_page_locator, str(page_number))
        )
    except TimeoutException:
        page_next(page_number)
def product():
    """Print picture URL, price and title for each product on the current page.

    Waits for ``#J_goodsList`` to be present in the module-level ``browser``,
    parses the page source with pyquery and prints one dict per ``<li>``
    found under the list. Nothing is returned.
    """
    goods_list_locator = (By.CSS_SELECTOR, '#J_goodsList')
    # Longer timeout than the other waits: the listing also loads images.
    WebDriverWait(browser, 20).until(
        EC.presence_of_element_located(goods_list_locator)
    )
    parsed_page = pq(browser.page_source)
    # Mind the difference between id, class and tag selectors here.
    for entry in parsed_page('#J_goodsList > ul li').items():
        # find() is pyquery's search over all descendant nodes.
        picture_url = entry.find('img').attr('src')
        price_text = entry.find('i').text()[:5]
        title_text = entry.find('em').text()
        print({
            'picture': picture_url,
            'price': price_text,
            'title': title_text,
        })
def main():
    """Search JD for the keyword and print products from the first pages.

    Page 1 is printed right after the search; pages 2 to 4 follow, limited
    to the total page count reported by ``search()`` so that a page which
    does not exist is never requested.
    """
    url = "https://www.jd.com/"
    key = "手机"
    total_pages = int(search(url=url, key=key))
    product()
    # Only a few pages are crawled to keep data usage low. range() excludes
    # its upper bound, hence the +1.
    last_page = min(4, total_pages)
    for i in range(2, last_page + 1):
        print('正在解析第%d页' % i)
        page_next(i)
        product()
if __name__ == '__main__':
    # Module-level globals: ``browser`` is read by search() and product(),
    # ``wait`` by page_next().
    browser = webdriver.Firefox()
    try:
        wait = WebDriverWait(browser, 15)
        main()
    finally:
        # Close the Firefox session even when the crawl fails, so no browser
        # process is left running.
        browser.quit()