pyspider
url = "https://www.creditchina.gov.cn/xinxigongshi/shipinanquanjianduchoujian/jieguoxiangqing/index.html?id=62335&dataType=1"
# # print("aaaaaa")
# # html = requests.get(url, headers=headers, proxies=proxies )
# # html.encoding = "utf-8"
# #
# # content = pq(str(html.text))('div.content.clearfix > div > div > div.result-tab.result-tab1')
# # print(content)
- Install path
C:\Users\YScredit\AppData\Roaming\Python\Python35\site-packages\pyspider
- Print the fetched page
response.text    # a property on pyspider's Response, not a method call
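For a quick look at what pyspider fetched, you can print it from any callback; a minimal sketch (the URL is a placeholder):

from pyspider.libs.base_handler import *

class DebugHandler(BaseHandler):
    def on_start(self):
        # Placeholder URL for illustration.
        self.crawl('http://example.com/', callback=self.index_page)

    def index_page(self, response):
        print(response.text)                   # raw HTML as str (no parentheses)
        print(response.doc('title').text())    # same content via the built-in PyQuery doc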
- Set the character set for the database and table
CREATE DATABASE dbtest CHARACTER SET utf8 COLLATE utf8_general_ci;

CREATE TABLE tbtest (
    name    VARCHAR(111),
    type    VARCHAR(111),
    num     VARCHAR(111),
    address VARCHAR(111),
    time    VARCHAR(111)
) CHARACTER SET utf8 COLLATE utf8_general_ci;
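To make sure inserts from Python round-trip non-ASCII text, connect with the same charset; a minimal sketch reusing the host/credentials from the examples below:

import pymysql

conn = pymysql.connect(host="192.168.59.128", user="root", password="123456",
                       db="dbtest", charset="utf8")
try:
    with conn.cursor() as cur:
        cur.execute("SHOW CREATE TABLE tbtest")
        print(cur.fetchone()[1])  # should show CHARSET=utf8, COLLATE utf8_general_ci
finally:
    conn.close()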
Example no. 1 - Fuzhou Court website
from pyspider.libs.base_handler import *
from pyquery import PyQuery as pq
import pymysql
from six import itervalues
import time
import datetime
import re


class Handler(BaseHandler):
    crawl_config = {
        'itag': 'bzxr-fuzhou-0.7',
        'timeout': 4000,  # the fetch option is spelled 'timeout' ('time_out' was a typo)
    }
    @every(minutes=24 * 60)
    @config(age=12 * 60 * 60)
    def on_start(self):
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'fzszy.chinacourt.org',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'
        }
        basic_url = 'http://fzszy.chinacourt.org/public/more.php?LocationID=0704000000'
        self.crawl(url=basic_url, headers=headers, callback=self.basic_page)
    @config(age=12 * 60 * 60)
    def basic_page(self, response):
        dic = {
            'id': '',
            'name': '',
            'case_code': '',
            'name_id': '',
            'itype': '',
            'card_num': '',
            'business_entity': '',
            'sex': '',
            'age': '',
            'address': '',
            'execute_money_backup': '',
            'unexecute_money_backup': '',
            'reg_date': '',
            'court_name': '',
            'org_url': '',
            'source': '福州法院网',
            'case_id': '',
            'exp': '1'
        }
        try:
            basic_url = response.url
            basic_trs = response.doc('tr.tr_odd td.td_line')
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                'Host': 'fzszy.chinacourt.org',
                'Referer': basic_url,
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'
            }
            for each_tr in basic_trs.items():
                org_url = each_tr('a').attr.href
                dic['org_url'] = org_url
                self.crawl(url=org_url, headers=headers, callback=self.detail_page,
                           save=dic)
        except Exception as e:
            dic['exp'] = '0'
            if dic['name'] != '':
                yield dic
    @config(priority=2)
    @config(age=12 * 60 * 60)
    def detail_page(self, response):
        response.encoding = 'gbk'  # the court pages are GBK-encoded
        dic = response.save
        try:
            detail_trs = response.doc('span.detail_content > strong > table')
            # Skip the two header rows of the table.
            detail_tr = detail_trs('tr:not(:first-child):not(:nth-child(2))')
            for each_tr in detail_tr.items():
                dic['id'] = each_tr('td:nth-child(1)').text().replace(' ', '')
                dic['name'] = each_tr('td:nth-child(2)').text().replace(' ', '')
                dic['name_id'] = each_tr('td:nth-child(3)').text().replace(' ', '')
                dic['address'] = each_tr('td:nth-child(4)').text().replace(' ', '')
                dic['execute_money_backup'] = each_tr('td:nth-child(5)').text().replace(' ', '')
                yield dic
        except Exception as e:
            dic['exp'] = '0'
            if dic['name'] != '':
                yield dic
    def on_result(self, result):
        if not result:
            return
        for i in result:
            print(i + " " + result[i])
        self.save_mysql(result)
    def save_mysql(self, item):
        connect = pymysql.connect(host="192.168.59.128",
                                  user="root",
                                  password="123456",
                                  db="demo1",
                                  charset="utf8",
                                  use_unicode=False)
        cursor = connect.cursor()
        # The table/column names below look reused from the Tencent-jobs example;
        # here they just serve as generic varchar slots for the court record.
        positionName = item["address"]
        positionLink = item['name']
        positionType = item['source']
        peopleNumber = item['execute_money_backup']
        workLocation = item['card_num']
        publishTime = item['itype']
        sql = "insert into tencent (positionName,positionLink,positionType," \
              "peopleNumber,workLocation,publishTime) VALUES (%s,%s,%s,%s,%s,%s)"
        lis = (positionName, positionLink, positionType, peopleNumber, workLocation, publishTime)
        cursor.execute(sql, lis)
        connect.commit()
        cursor.close()
        connect.close()
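Example no. 1 hands the partially filled dict from basic_page to detail_page via self.crawl(..., save=dic), and pyspider returns it as response.save. A minimal sketch of just that mechanism (URLs and selectors are placeholders):

from pyspider.libs.base_handler import *

class SaveDemo(BaseHandler):
    def on_start(self):
        self.crawl('http://example.com/list', callback=self.list_page)

    def list_page(self, response):
        for a in response.doc('a').items():
            # 'save' may hold any JSON-serializable object; it travels with the task.
            self.crawl(a.attr.href, callback=self.item_page,
                       save={'org_url': a.attr.href})

    def item_page(self, response):
        record = response.save  # the dict passed above
        record['title'] = response.doc('title').text()
        return record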
Example no. 2 - crawling Tencent recruitment
from pyspider.libs.base_handler import *
from pyquery import PyQuery as pq
import pymysql
from six import itervalues
import time
import datetime
import re


class Handler(BaseHandler):
    crawl_config = {
        'itag': 'bzxr-fuzhou-0.8',
        'timeout': 4000,  # spelled 'timeout' ('time_out' was a typo)
        'proxy': 'H21WNK49K6PFSR3P:[email protected]:9010'
    }
    @every(minutes=24 * 60)
    @config(age=12 * 60 * 60)
    def on_start(self):
        basic_url = 'https://hr.tencent.com/position.php?&start=#a0'
        self.crawl(url=basic_url, callback=self.basic_page, validate_cert=False)
    @config(age=12 * 60 * 60)
    def basic_page(self, response):
        baseUrl = "https://hr.tencent.com/position.php?&start={}#a"
        # The last pager link holds the total page count; each page offsets by 10.
        countAll = int(response.doc("div.pagenav > a:nth-child(10)").text())
        num = 0
        while num < countAll * 10:
            url = baseUrl.format(num)
            self.crawl(url=url, validate_cert=False, callback=self.detail_page)
            num = num + 10
    @config(priority=2)
    @config(age=12 * 60 * 60)
    def detail_page(self, response):
        response.encoding = 'utf-8'
        # Defined up front so the except branch below cannot hit a NameError.
        dic = {
            'name': '',
            'type': '',
            'num': '',
            'address': '',
            'time': '',
        }
        try:
            detail_trs = response.doc('#position > div.left.wcont_b.box > table')
            # Skip the header row and the trailing pager row.
            detail_tr = detail_trs('tr:not(:first-child):not(:last-child)')
            for each_tr in detail_tr.items():
                dic = {
                    'name': '',
                    'type': '',
                    'num': '',
                    'address': '',
                    'time': '',
                }
                dic['name'] = each_tr('td:nth-child(1)').text().replace(' ', '')
                if len(each_tr('td:nth-child(2)').text().replace(' ', '')) == 0:
                    dic['type'] = "null"
                else:
                    dic['type'] = each_tr('td:nth-child(2)').text().replace(' ', '')
                dic['num'] = each_tr('td:nth-child(3)').text().replace(' ', '')
                dic['address'] = each_tr('td:nth-child(4)').text().replace(' ', '')
                dic['time'] = each_tr('td:nth-child(5)').text().replace(' ', '')
                yield dic
        except Exception as e:
            dic['exp'] = '0'
            if dic['name'] != '':
                yield dic
    def on_result(self, result):
        print(result)
        if not result:
            return
        # Two alternative persistence helpers are shown; in practice pick one.
        self.insert_text(tablename='tbtest', **result)
        self.save_mysql(result)
    def escape(self, string):
        # Back-quote identifiers (table/column names); values themselves go
        # through the driver's parameter binding in execute() below.
        return '`%s`' % string

    def insert_text(self, tablename=None, **values):
        kwargs = {
            'host': '192.168.59.128',
            'user': 'root',
            'passwd': '123456',
            'db': 'dbtest',
            'charset': 'utf8'
        }
        if not values:
            return False
        tablename = self.escape(tablename)
        _keys = ",".join(self.escape(k) for k in values)
        _values = ",".join(['%s'] * len(values))
        sql_query = "INSERT IGNORE INTO %s (%s) VALUES (%s)" % (tablename, _keys, _values)
        print(sql_query, list(itervalues(values)))
        cnx = pymysql.connect(**kwargs)
        cur = cnx.cursor()
        try:
            cur.execute(sql_query, list(itervalues(values)))
            cnx.commit()
            return True
        except Exception as e:
            print(e)
            return False
        finally:
            cur.close()
            cnx.close()
    def save_mysql(self, item):
        connect = pymysql.connect(host="192.168.59.128",
                                  user="root",
                                  password="123456",
                                  db="dbtest",
                                  charset="utf8",
                                  use_unicode=False)
        cursor = connect.cursor()
        name = item["name"]
        type = item['type']
        num = item['num']
        address = item['address']
        time = item['time']
        sql = "insert into tbtest (name,type,num," \
              "address,time) VALUES (%s,%s,%s,%s,%s)"
        lis = (name, type, num, address, time)
        cursor.execute(sql, lis)
        connect.commit()
        cursor.close()
        connect.close()
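Note that the INSERT IGNORE in insert_text only skips duplicates if tbtest has a UNIQUE or PRIMARY key; as created above it has none, so rerunning the spider appends duplicate rows. A minimal sketch adding one (the key name and column pair are assumptions):

import pymysql

conn = pymysql.connect(host="192.168.59.128", user="root", password="123456",
                       db="dbtest", charset="utf8")
try:
    with conn.cursor() as cur:
        # uk_position is a made-up name; (name, time) is assumed to identify a posting.
        cur.execute("ALTER TABLE tbtest ADD UNIQUE KEY uk_position (name, time)")
    conn.commit()
finally:
    conn.close()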