1、requests.get/post(url, headers=headers, params=params / data=data)
# 使用requests库，根据网页的get或post请求（存在传参的话get用params，post用data），得到一个Response对象
Response.text（打印文本）
Response.encoding（文本编码）补充：decode（）解码：decode('gbk')把一个‘gbk’编码的字节串转成unicode；encode（）编码：把unicode转成其他编码
Response.content（以字节的方式访问响应内容）
Response.status_code（响应状态码）
2、xpath解析
from lxml import etree（导包）
html_page = etree.HTML(Response.content)
data = html_page.xpath('//select[@id="cx_province"]/option/@value')（xpath提取）
3、bs4提取
from bs4 import BeautifulSoup（导包）
soup = BeautifulSoup(Response.content, 'lxml')
links = soup.find_all('a', class_='')  # 查找所有class=''的a标签
for a in links: a.text（a标签下的文本），a.get('href')（a标签的链接）
4、正则提取
。。。。
5、json数据提取
json.loads()或者eval()（注意：eval()会执行任意代码，处理不可信数据时应优先使用json.loads()）
6、保存session对话访问和代理ip访问
def ip_sql():
    """Read every row of the ``proxyippool`` MySQL table into ``ip_data``.

    For each fetched row the first three columns are joined as
    ``row[0] + '://' + row[1] + ':' + row[2]`` and the resulting string is
    appended to ``ip_data``, a list that must already exist at module level
    (it is not created here). Nothing is returned.

    NOTE(review): the join looks like ``scheme://host:port`` -- confirm
    against the table schema. ``SELECT *`` makes the result depend on the
    table's column order, and all three columns must be strings for the
    concatenation to work.
    """
    # NOTE(review): host and credentials are hard-coded; consider moving
    # them to configuration.
    db = MySQLdb.connect("192.168.201.91", "root", "123456", "spidertools")
    try:
        cursor = db.cursor()
        cursor.execute("SELECT * FROM proxyippool; ")
        for row in cursor.fetchall():
            ip_data.append(row[0] + '://' + row[1] + ':' + row[2])
        # No commit: the function only reads, so there is nothing to persist.
    finally:
        # Release the connection even if the query or row handling fails.
        db.close()
def savesql_hotel(data, savepoint_name):
    """Append one hotel record to the ``data`` table of a SQLite database.

    The table is created on first use. ``hoteltype`` is always stored as an
    empty string; the remaining columns are filled from ``data`` by position.

    Args:
        data: Indexable with at least seven items, in the order
            city, name, address, salesTel, lat, lng, Url.
        savepoint_name: Path of the SQLite database file (created by
            ``sqlite3.connect`` if it does not exist).

    Raises:
        IndexError: If ``data`` has fewer than seven items.
        sqlite3.Error: If the database rejects the statement (for example a
            ``None`` city violating the NOT NULL constraint).
    """
    import sqlite3

    con = sqlite3.connect(savepoint_name)
    try:
        con.execute('''CREATE TABLE IF NOT EXISTS data
            (
            hoteltype varchar(200) NOT NULL,
            -- province varchar(200) NOT NULL,
            city varchar(200) NOT NULL,
            name varchar(1000) DEFAULT NULL,
            address varchar(1000) DEFAULT NULL,
            salesTel varchar(100) DEFAULT NULL,
            lat varchar(100) DEFAULT NULL,
            lng varchar(100) DEFAULT NULL,
            Url varchar(1000) DEFAULT NULL);''')
        # Bound parameters instead of string interpolation: values that
        # contain quotes are stored verbatim and cannot alter the statement.
        con.execute(
            'insert into data(hoteltype,city,name,address,salesTel,lat,lng,Url) '
            'values(?,?,?,?,?,?,?,?)',
            (u'', data[0], data[1], data[2], data[3], data[4], data[5], data[6]))
        con.commit()
    finally:
        # Close the connection even when the insert fails.
        con.close()
# One session object is used for the request below, so the proxy settings
# assigned to it apply to every call made through it.
se = requests.session()
# Take one proxy at random from the proxy pool collected in ip_data.
ip = random.choice(ip_data)
# Route both plain and TLS traffic through the chosen proxy.
se.proxies = {'http': ip, 'https': ip}
# NOTE(review): the URL is elided in these notes; `headers` must be defined
# beforehand.
jstext = se.get('http://www.sicy......', headers=headers).content