# -*- coding: utf-8 -*-
"""
Created on Mon May 20 10:46:27 2019
@author: Administrator
"""
#! /usr/bin/env python
#coding:utf-8
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import time
import urllib
import urllib.request
import sys
import os
import re
import csv
import numpy as np
# 解决中文报错的问题
# Output CSV for the scraped company records. The csv module requires the
# file to be opened with newline='' -- without it, csv.writer emits an
# extra blank line after every row on Windows.
csvfile = open("C:\\Users\\Administrator\\data1.csv", 'w+', newline='')
writer = csv.writer(csvfile)
# Header row: company name, homepage, product, contact person, phone,
# address, region, business scope.
writer.writerow(('a企业名称', 'b主页', 'c产品', 'd联系人', 'e电话', 'f地址','地区','经营范围'))
# Company-search results page to scrape. The keyword is GBK percent-encoded
# (%B3%E8%CE%EF); netType=1%2C11 filters the listing by member/shop type.
url = 'https://s.1688.com/company/company_search.htm?keywords=%B3%E8%CE%EF&button_click=top&earseDirect=false&n=y&netType=1%2C11'
# Taobao SSO entry point for 1688: the long redirect_url chain bounces the
# session through login.1688.com and finally lands on the member workbench.
loginUrl = 'https://login.taobao.com/member/login.jhtml?style=b2b&from=b2b&newMini=true&full_redirect=true&redirect_url=https%3A%2F%2Flogin.1688.com%2Fmember%2Fjump.htm%3Ftarget%3Dhttps%253A%252F%252Flogin.1688.com%252Fmember%252FmarketSigninJump.htm%253FDone%253Dhttp%25253A%25252F%25252Fmember.1688.com%25252Fmember%25252Foperations%25252Fmember_operations_jump_engine.htm%25253Ftracelog%25253Dlogin%252526operSceneId%25253Dafter_pass_from_taobao_new%252526defaultTarget%25253Dhttp%2525253A%2525252F%2525252Fwork.1688.com%2525252F%2525253Ftracelog%2525253Dlogin_target_is_blank_1688®=http%3A%2F%2Fmember.1688.com%2Fmember%2Fjoin%2Fenterprise_join.htm%3Flead%3Dhttp%253A%252F%252Fmember.1688.com%252Fmember%252Foperations%252Fmember_operations_jump_engine.htm%253Ftracelog%253Dlogin%2526operSceneId%253Dafter_pass_from_taobao_new%2526defaultTarget%253Dhttp%25253A%25252F%25252Fwork.1688.com%25252F%25253Ftracelog%25253Dlogin_target_is_blank_1688%26leadUrl%3Dhttp%253A%252F%252Fmember.1688.com%252Fmember%252Foperations%252Fmember_operations_jump_engine.htm%253Ftracelog%253Dlogin%2526operSceneId%253Dafter_pass_from_taobao_new%2526defaultTarget%253Dhttp%25253A%25252F%25252Fwork.1688.com%25252F%25253Ftracelog%25253Dlogin_target_is_blank_1688%26tracelog%3Dmember_signout_signin_s_reg'
# Launch Firefox and log in before scraping -- 1688 will not serve the
# per-company contact pages to an anonymous session.
driver = webdriver.Firefox()
time.sleep(3)
driver.get(loginUrl)
time.sleep(5)
# NOTE(review): '账号'/'密码' literally mean "username"/"password" -- these
# are placeholders and must be replaced with real credentials before running.
driver.find_element_by_name("TPL_username").send_keys('账号')
driver.find_element_by_name("TPL_password").send_keys('密码')
# Submit the login form by pressing Enter inside the password field.
driver.find_element_by_name("TPL_password").send_keys(Keys.ENTER)
time.sleep(5)
# Open the search-results page; generous sleep to let it fully render.
driver.get(url)
time.sleep(15)
# Headers for the direct urllib fetch of each company's contactinfo page
# (Selenium is only used for the listing pages and login).
user_agent ='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
# Referer mimics arriving from a search-results page, to look like a browser.
referer='https://s.1688.com/company/company_search.htm?keywords=%B3%E8%CE%EF&netType=1%2C11&earseDirect=false&button_click=top&n=y&pageSize=30&offset=3&beginPage=2'
# NOTE(review): hard-coded session cookie copied out of a logged-in browser.
# It will expire; the contact-page fetches start failing silently once the
# session is no longer valid.
cookie='cna=Y5rxE+PKc1oCAXrrzxugRBX9; __last_loginid__=wuqi356; lid=wuqi356; last_mid=b2b-181889983; ali_apache_track=c_mid=b2b-181889983|c_lid=wuqi356|c_ms=1; _cn_slid_=fhUkekh54F; ali_beacon_id=115.198.246.136.155382792318.290921.6; hng=CN%7Czh-CN%7CCNY%7C156; ali_ab=115.192.175.104.1557106627179.0; unb=181889983; __wapcsf__=1; cookie2=53c6bc8fc0e2aaa9bab424e391439235; t=3bc0828126b6ffa0710eaf546c2279f5; _tb_token_=7e33ee36e75e6; __cn_logon__=false; _csrf_token=1558313884528; h_keys="%u5ba0%u7269#%u4e94%u91d1#%u8bf7%u5e16"; ad_prefer="2019/05/20 10:42:10"; alicnweb=touch_tb_at%3D1558331385489%7Clastlogonid%3Dwuqi356%7ChomeIdttS%3D01469649499088246736339992082871925097%7ChomeIdttSAction%3Dtrue; l=bBgewTxevH6LWlG6BOfwZuI8zM_OoIRb8sPzw4Mg6ICPOv5M5yzAWZtV4OLHC3GVa6fkJ3J4RKM8B0Tn-y4EC; isg=BBISwTZDmk88guEVCSe4feVLY9g0ixA254i04txrvkWw77PpxbI8zEFBXwv2n45V'
# Walk up to 99 result pages. Any failure while processing a page is caught
# at the bottom, 'error' is printed, and the loop moves on.
for page in range(1, 100):
    try:
        # Company-name links and the first product link of each result row.
        title = driver.find_elements_by_css_selector(".list-item-title-text")
        product = driver.find_elements_by_xpath("//div[@class=\"list-item-detail\"]/div[1]/div[1]/a[1]")
        # NOTE(review): these patterns look as though the surrounding HTML
        # tags were stripped out at some point (most are just '(.*?)'), so
        # the findall() calls below will mostly return empty-string matches.
        # Verify against the original script before relying on the output.
        pattern = re.compile('(.*?)', re.S)
        telPattern = re.compile('(.*?) ', re.S)  # phone number
        membernamePattern = re.compile('(.*?)', re.S)  # contact person
        addressPattern = re.compile('"address">(.*?)', re.S)  # address
        campanyPattern=re.compile('(.*?)', re.S)  # region
        shopPattern=re.compile('(.*?)', re.S)  # business scope
        for i in range(len(title)):
            titleValue = title[i].get_attribute('title')
            # Contact page lives under the company's homepage URL.
            hrefValue = title[i].get_attribute('href') + 'page/contactinfo.htm'
            productValue = product[i].text
            # Fetch the contact page directly, reusing the logged-in cookie.
            headers = {'User-Agent' : user_agent, 'Accept' : '*/*', 'Referer' : referer,'Cookie':cookie}
            request = urllib.request.Request(hrefValue, headers=headers)
            response = urllib.request.urlopen(request)
            # 1688 serves these pages GBK-encoded.
            html = response.read().decode('GBK')
            # encode_type = chardet.detect(html)
            #html = html.decode(encode_type['utf-8'])
            #print(html)
            info = re.findall(pattern, html)
            try:
                info = info[0]
            except Exception:
                # No contact block found: skip this company entirely.
                continue
            tel = re.findall(telPattern, info)
            try:
                tel = tel[0]
                tel = tel.strip()
                # Normalize internal spaces to dashes.
                tel = tel.replace(' ', '-')
                print(tel)
            except Exception:
                continue
            membername = re.findall(membernamePattern, html)
            try:
                membername = membername[0]
                membername = membername.strip()
            except Exception:
                continue
            address = re.findall(addressPattern, html)
            try:
                address = address[0]
                address = address.strip()
            except Exception:
                # Address is optional: fall back to an empty string.
                address = ''
            campany = re.findall(campanyPattern, html)
            try:
                #print(campany)
                # Last match is taken as the region.
                campany = campany[-1]
                campany = campany.strip()
            except Exception:
                continue
            shop = re.findall(shopPattern, html)
            try:
                shop = shop[0]
                shop = shop.strip()
            except Exception:
                continue
            print ('tel:' + tel)
            print ('membername:' + membername)
            print ('shop:' + shop)
            print ('campany:' + campany)
            # One CSV row per company, matching the header written at startup.
            data = (titleValue, title[i].get_attribute('href'), productValue, membername, tel, address,campany,shop)
            writer.writerow(data)
        # Debug dump of every name/link/product found on this listing page.
        for t in title:
            print( t.get_attribute('title'))
            print (t.get_attribute('href') + 'page/contactinfo.htm')
        print (len(product))
        for p in product:
            a = repr(p.text)
            # gbk with 'ignore' so unencodable characters don't crash printing.
            a = a.encode('gbk', 'ignore')
            print (a)
        print ('#' * 50)
        # Scroll to the bottom so the "next page" control is in view.
        js = 'var q=document.documentElement.scrollTop=30000'
        driver.execute_script(js)
        time.sleep(2)
        # Click the next-page button. (This rebinds the loop variable 'page';
        # harmless here, since 'for' reassigns it on the next iteration.)
        page = driver.find_elements_by_css_selector("a[class=page-next]")
        page = page[0]
        page.click()
        time.sleep(20)
    except Exception:
        print ('error')
        continue
# Flush the CSV and shut down the browser session. driver.quit() (rather
# than close()) terminates the whole WebDriver session, including the
# geckodriver process; close() would only close the current window and
# leave the driver process running.
csvfile.close()
driver.quit()
# Scrapes 1688.com merchant/contact information (keyword search -> contact pages -> CSV).