Automatically search by keyword and scrape the matching web pages.
Listing pages come in two kinds: those whose total record count can be read directly, and those where it cannot.
The two cases call for different approaches:
Scenario 1: scrape the record count first, then scrape the data page by page.
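Both scripts build the search URL by percent-encoding the keyword in whatever character set the target site expects. A minimal sketch of that step, assuming a GB2312 site; the keyword and URL prefix here are examples only:

import urllib.parse
wd = urllib.parse.quote("轴承".encode('gb2312'))  # GB2312 bytes rendered as %XX escapes
url = "partial base URL + key=" + wd + ';use_cas=0;f=pclist;p=0'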
# coding=utf-8
import urllib.parse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
import csv
import re
import random

option = webdriver.ChromeOptions()
option.add_argument("--headless")                          # run Chrome without a window
# option.binary_location = r"...\chrome.exe"               # set if Chrome lives elsewhere
option.add_argument('blink-settings=imagesEnabled=false')  # skip images to speed up page loads
driver = webdriver.Chrome(service=Service(r"...\chromedriver.exe"),
                          options=option)

head_url = "partial base URL + key="  # placeholder: the site's search URL prefix
keywords_all = []                     # fill in the full keyword list
keywords = keywords_all[410:444]      # the slice of keywords to scrape this run
keyword_list = []
company_name_list = []
company_url_list = []
phone_list = []
def PageNumber(keyword):
    # GB2312-encode the keyword, then percent-encode it for the URL
    wd = urllib.parse.quote(keyword.encode('gb2312'))
    turn_url = head_url + wd + ';use_cas=0;f=pclist;p=0'
    driver.get(turn_url)
    # print(driver.page_source)
    time.sleep(random.randint(1, 3))
    try:
        source = driver.find_element(By.XPATH,
                                     "//div[@class='gys']/dl/dt/span").text
        # the counter element reads like "……有N家"; pull out N
        reg = re.findall(r".*有(.*)家", source)
        record_count = int(reg[0])
        print("found", record_count, "records")
        return record_count
    except Exception:
        return -1  # sentinel: no counter element, so no results
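# A quick sanity check of the count-parsing regex above; the sample string
# is made up, the real text comes from the site's counter element:
# >>> re.findall(r".*有(.*)家", "共找到供应商有135家")
# ['135']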
def GetResult(keyword, page):
    wd = urllib.parse.quote(keyword.encode('gb2312'))
    turn_url = head_url + wd + ';use_cas=0;f=pclist;p=' + str(page)
    print(turn_url)
    try:
        driver.get(turn_url)
        time.sleep(random.randint(2, 4))
        rows = driver.find_elements(By.XPATH,
                                    "//div[@class='gys']/dl/dd/form")
        for row in rows:
            # collect every field before appending, so the four result lists
            # always stay the same length (the zip() below relies on this)
            try:
                company = row.find_element(By.XPATH, "./table/tbody/tr/td/a").text
                company_url = row.find_element(By.XPATH,
                                               "./table/tbody/tr/td/a[1]").get_attribute('href')
                phone = row.find_element(By.XPATH, "./table/tbody/tr[2]/td[2]").text
            except Exception:
                continue  # skip rows missing any field
            print(company, company_url, phone, keyword)
            company_name_list.append(company)
            company_url_list.append(company_url)
            phone_list.append(phone)
            keyword_list.append(keyword)
    except Exception:
        print('failed to load page')
for i in keywords:
    record_count = PageNumber(keyword=i)
    if record_count == -1:      # check the sentinel before dividing,
        print(i, 'no results')  # otherwise int(-1/10) == 0 hides it
        continue
    # 10 records per page; round up so a trailing partial page is kept
    page_count = max((record_count + 9) // 10, 1)
    for p in range(page_count):
        try:
            GetResult(keyword=i, page=p)
        except Exception:
            continue
data_list = []
for a, b, c, d in zip(keyword_list, company_name_list, company_url_list, phone_list):
    x = {}
    x['keyword'] = a
    x['company_name'] = b
    x['company_url'] = c
    x['phone'] = d
    data_list.append(x)
# print(data_list)
with open(r"###.csv", 'w', newline='', encoding='UTF-8') as f_c_csv:
    writer = csv.writer(f_c_csv)
    writer.writerow(['keyword', 'company_name', 'company_url', 'phone'])
    for nl in data_list:
        writer.writerow(nl.values())
print("write complete!")
driver.quit()  # release the headless browser
Scenario 2: the page count cannot be read from the page, so the crawler steps through pages until it hits a short one.
# coding=utf-8
import urllib.parse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
import csv
import random

option = webdriver.ChromeOptions()
option.add_argument("--headless")                          # run Chrome without a window
# option.binary_location = r"...\chrome.exe"               # set if Chrome lives elsewhere
option.add_argument('blink-settings=imagesEnabled=false')  # skip images to speed up page loads
driver = webdriver.Chrome(service=Service(r"...\chromedriver.exe"),
                          options=option)

head_url = "partial base URL + keyword="  # placeholder: the site's search URL prefix
keywords_all = []                         # fill in the full keyword list
keywords = keywords_all[400:444]          # the slice of keywords to scrape this run
keyword_list = []
product_name_list = []
company_name_list = []
company_url_list = []
mobilephone_list = []
telephone_list = []
def NextPage(keyword, page):
    wd = urllib.parse.quote(keyword.encode('utf-8'))
    if page == 0:
        turn_url = head_url + wd
    else:
        turn_url = head_url + wd + "&p=" + str(page)
    print(turn_url)
    driver.get(turn_url)
    time.sleep(random.randint(1, 3))
    items = driver.find_elements(By.XPATH,
        "//div[@class='lc-grid-list']//div[@class='container']//div[@class='grid-body']//div[@class='lc-main']//div[@class='lc-products-wrap']//div[@class='pro-item clearfix ']")
    return len(items)  # number of product entries on this page
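# NextPage doubles as the pagination probe: a full listing page holds 20
# items, so the first page that returns fewer than 20 is the last page for
# that keyword. With hypothetical per-page counts of 20, 20, 7, pages 0 and
# 1 are full and page 2 ends the crawl.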
def GetResult(keyword, page):
    wd = urllib.parse.quote(keyword.encode('utf-8'))
    if page == 0:
        turn_url = head_url + wd
    else:
        turn_url = head_url + wd + "&p=" + str(page)
    driver.get(turn_url)
    time.sleep(random.randint(3, 5))
    try:
        items = driver.find_elements(By.XPATH,
            "//div[@class='lc-grid-list']//div[@class='container']//div[@class='grid-body']//div[@class='lc-main']//div[@class='lc-products-wrap']//div[@class='pro-item clearfix ']")
        for item in items:
            product_name = item.find_element(By.XPATH,
                "./div[@class='pro-info']/div[@class='intro-box']/div[@class='tt']/a").text
            print(product_name)
            product_name_list.append(product_name)
            # fall back to empty strings instead of skipping, so the six
            # result lists stay aligned for the zip() below
            try:
                telephone = item.find_element(By.XPATH,
                    "./div[@class='pro-info']/div[@class='basic-box']/div[@class='info']/dl/dd[2]").text
                mobilephone = item.find_element(By.XPATH,
                    "./div[@class='pro-info']/div[@class='basic-box']/div[@class='info']/dl/dd[3]").text
            except Exception:
                telephone = ''
                mobilephone = ''
            print(telephone, mobilephone)
            telephone_list.append(telephone)
            mobilephone_list.append(mobilephone)
            company = item.find_element(By.XPATH,
                "./div[@class='pro-info']/div[@class='basic-box']/div[@class='title']/em").text
            print(company)
            company_name_list.append(company)
            # take only the first link so this list, too, keeps one entry per item
            links = item.find_elements(By.XPATH,
                "./div[@class='pro-info']/div[@class='basic-box']/div[@class='title']/em/a")
            company_url = links[0].get_attribute('href') if links else ''
            print(company_url)
            company_url_list.append(company_url)
            print(keyword)
            keyword_list.append(keyword)
    except Exception:
        print("scrape failed")
for i in keywords:
    this_page = 0
    while True:
        # a full page holds 20 items; a shorter page is the last one
        item_count = NextPage(keyword=i, page=this_page)
        GetResult(keyword=i, page=this_page)
        if item_count < 20:
            break
        this_page = this_page + 1
data_list = []
for a, b, c, d, e, f in zip(keyword_list, product_name_list, company_name_list,
                            company_url_list, mobilephone_list, telephone_list):
    x = {}
    x['keyword'] = a
    x['product_name'] = b
    x['company_name'] = c
    x['company_url'] = d
    x['mobilephone'] = e
    x['telephone'] = f
    data_list.append(x)
# print(data_list)
with open("###.csv", 'w', newline='', encoding='UTF-8') as f_c_csv:
    writer = csv.writer(f_c_csv)
    writer.writerow(['keyword', 'product_name', 'company_name', 'company_url',
                     'mobilephone', 'telephone'])
    for nl in data_list:
        writer.writerow(nl.values())
print("write complete!")
driver.quit()  # release the headless browser