python爬虫实战-----利用selenium爬取表格一

这是我之前在一家公司做爬虫实习时写的程序,内容是爬取携程网站上境外酒店房间的各种信息,然后存入 SQL Server 中,现在与大家分享!一开始,公司带我的老师给我的任务是:建立几个数据库表,内容涉及携程网中所有境外酒店的房间信息。我首先根据他给的信息建好了表格,接着思考怎么编程,所用的语言是 Python。由于信息主要在网站的表格中,所以我觉得采用 selenium 库为宜。但是,selenium 库中 webdriver 的速度过慢,在下一篇文章中,我将解决这一问题。我们先看看下面这些代码!
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup
import json
from selenium import webdriver
import pyodbc
# Open the shared SQL Server connection used by the rest of the script.
# NOTE(review): server name and credentials are hard-coded in this connection string.
cnxn = pyodbc.connect(
    "DRIVER={SQL Server};SERVER=WIN-20160105DRP;DATABASE=CtripData;UID=sa;PWD=123"
)

# Load every row of DownHotelListLive whose dstatus is 1; each row supplies the
# ID, HotelCode and URL consumed by the crawl loop.
pending_cursor = cnxn.cursor()
pending_query = "SELECT ID,HotelCode,URL from DownHotelListLive where dstatus=1"
pending_cursor.execute(pending_query)
allselect = pending_cursor.fetchall()
pending_cursor.close()
# ---------------------------------------------------------------------------
# Per-hotel crawl loop.
#
# For every row in ``allselect`` the script renders the Chinese page (row.URL)
# and the English trip.com page, scrapes both with BeautifulSoup, and writes
# the results into CtripHotelList, CtripHotelRoomList, CtripHotelDesc,
# CtripHotelBookInfo, CtripHotelBrand and CtripHotelFacilities. Finally the
# source row is marked DStatus=4.
#
# All SQL uses bound "?" parameters instead of %-interpolation, so scraped
# text containing quotes can neither break nor inject into a statement.
# ---------------------------------------------------------------------------


def _run_sql(sql, params=()):
    """Execute one write statement with bound parameters and commit it."""
    cursor = cnxn.cursor()
    try:
        cursor.execute(sql, *params)
        cnxn.commit()
    finally:
        cursor.close()


def _fetch_one(sql, params=()):
    """Run a query with bound parameters and return its first row (or None)."""
    cursor = cnxn.cursor()
    try:
        cursor.execute(sql, *params)
        return cursor.fetchone()
    finally:
        cursor.close()


def _fetch_page_source(url):
    """Render *url* in a fresh Chrome session and return the page HTML.

    The browser is always closed, even when loading fails, so a bad page
    does not leave a Chrome process behind.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        return driver.page_source
    finally:
        driver.quit()


def _after_last_gt(markup):
    """Return the text after the last '>' in *markup* (all of it if none)."""
    return markup.rpartition('>')[2]


def _text_until_lt(markup, start):
    """Return markup[start:] up to the next '<', or '' if no '<' follows."""
    end = markup.find('<', start)
    return markup[start:end] if end != -1 else ''


def _base_room_details(base_room):
    """Parse the JSON stored in the ``data-text`` attribute of a CN base-room row."""
    hroom_base = base_room.findAll("div", {"class": "hroom_col hroom_col_type"})[0].find("dl", {"class": "hroom_base"})
    data_text = hroom_base.find("dd", {"class": "hroom_base_txt J_hroom_base_detail"}).get("data-text")
    return json.loads(str(data_text))


for oneselect in allselect:
    # NOTE(review): no parser is passed to BeautifulSoup, so bs4 picks whichever
    # one is installed. Left unchanged because the fixed-offset slicing below was
    # tuned against that output; pin a parser once it is confirmed.
    Html_Change = BeautifulSoup(_fetch_page_source(oneselect.URL))
    url_EN = 'https://www.trip.com/hotels/london-hotel-detail-' + str(oneselect.HotelCode) + '/'
    Html_Change_EN = BeautifulSoup(_fetch_page_source(url_EN))

    Hroom_list_cn = Html_Change.findAll("div", {"class": "hroom_list"})[0]

    # Shared anchors inside the Chinese page header.
    cont_in = Html_Change.find("div", {"class": "cont"}).findAll("div", {"class": "cont_in"})[0]
    htl_info_box = cont_in.find("div", {"class": "cont_main"}).find("div", {"class": "htl_info_com"}).find("div", {"class": "htl_info"})
    htl_head = htl_info_box.findAll("div")[0]

    # ---- hotel tags ------------------------------------------------------
    # The fixed offsets strip the opening <span ...> and closing </span>.
    tag_spans = htl_head.find("div", {"class": "htl_info_tags"}).findAll("span")
    HTags_list = [str(tag_span)[32:-7] for tag_span in tag_spans]
    HTags_str = ','.join(HTags_list)  # '' when the hotel has no tags

    # ---- hotel names -----------------------------------------------------
    name_html = htl_head.find("h1", {"class": "name"})
    nameEN = str(name_html.find("span", {"class": "ename"}))[20:-7].strip()
    name_markup = str(name_html)
    if name_markup[35].isalpha():
        nameCN = ''
    else:
        # Text from offset 33 up to the next tag; the whole remainder is used
        # if no '<' follows (previously this left the cut index undefined).
        nameCN = name_markup[33:].partition('<')[0].strip()

    # ---- breadcrumb: country / city / area -------------------------------
    location_html = Html_Change.find("div", {"class": "cont"}).find("div", {"class": "path_bar2"}).find("div", {"itemprop": "breadcrumb"}).findAll("a")
    # [:-6] removes "</a>" plus the two characters before it; [:-4] removes
    # only "</a>". Presumably the two extra characters are a "酒店" suffix.
    country_html = _after_last_gt(str(location_html[1])[:-6])
    city_html = _after_last_gt(str(location_html[2])[:-6])
    area_html = _after_last_gt(str(location_html[-1])[:-4])
    # Only strip "酒店" when it really is the suffix; the old substring test
    # chopped two arbitrary characters whenever "酒店" appeared anywhere.
    if area_html.endswith('酒店'):
        area_html = area_html[:-2]
    if city_html.endswith('酒店'):
        city_html = city_html[:-2]

    # ---- address and coordinates -----------------------------------------
    address_html = htl_head.find("div", {"class": "adress"}).find("span", {"class": "address_text"})
    address_EN = str(address_html)[27:-7]
    location_value = Html_Change.find("input", {"id": "hotelCoordinate"}).get("value").split('|')
    latitude_value = location_value[0]
    longitude_value = location_value[1]

    # ---- review count ----------------------------------------------------
    cmt_summary = cont_in.find("div", {"class": "cont_aside"}).find("div", {"class": "cmt_summary c-2"})
    if cmt_summary is not None:
        comment_num_html = cmt_summary.find("div", {"class": "cmt_summary_hd"}).find("a", {"id": "commnet_score"})
        comment_num = _after_last_gt(str(comment_num_html)[:-12])
    else:
        comment_num = 0

    # ---- starting price --------------------------------------------------
    price_tag = htl_info_box.find("div", {"class": "price_box"}).find("div", {"class": "J_price_info"}).find("div", {"class": "staring_price"}).find("div", {"class": "detail_price"}).find("span", {"class": "price"})
    price = str(price_tag)[20:-7]
    if not price.isdigit():
        price = ''

    # ---- CtripHotelList --------------------------------------------------
    _run_sql(
        "INSERT INTO CtripHotelList([HCode],[HotelTag],[NameEN],[NameCN],[CountryName],[CityName],[AreaName],[AddrEN],[Latitude],[Longitude],[DianpingNum],[PriceInfo],[SourceURL],[UpdateDate],[AddDate]) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,getdate(),getdate())",
        (str(oneselect.HotelCode), HTags_str, nameEN, nameCN, country_html, city_html, area_html, address_EN, latitude_value, longitude_value, int(comment_num), price, oneselect.URL),
    )
    selectforHid = _fetch_one("SELECT Hid from CtripHotelList where HCode=?", (str(oneselect.HotelCode),))[0]

    # ---- English room list: one data-roomid per room table ----------------
    hotel_type_list = Html_Change_EN.findAll("div", {"class": "m-hotel-type", "id": "room_table"})[0].findAll("div", {"class": "hotel-type__list"})
    roomid_list = [int(hotel_type.find("table").get("data-roomid")) for hotel_type in hotel_type_list]

    # ---- Chinese rooms -> CtripHotelRoomList -----------------------------
    allhroomlist = Hroom_list_cn.findAll("div", {"class": "hroom_tr J_baseRoomlist "})
    for base_room in allhroomlist:
        hroominfo_json = _base_room_details(base_room)
        roomId = int(hroominfo_json['comment_baseroomId'])
        roomName = hroominfo_json['comment_baseroomName']

        subRoomlist = base_room.find("div", {"class": "hroom_tr_cols"}).findAll("div", {"class": "hroom_tr_col J_subRoomlist"})
        freewifi_num = 0
        nonsmoking_num = 0
        for sub_room in subRoomlist:
            hroom_col_network = str(sub_room.find("div", {"class": "hroom_col hroom_col_network"}))
            if '免费' in hroom_col_network:
                freewifi_num += 1
            # NOTE(review): this counts sub-rooms that carry smoking info but are
            # NOT marked "不可吸烟", which looks inverted for a counter named
            # "nonsmoking" (the English side counts 'Non-smoking'). Kept as-is;
            # confirm the intended meaning of SmokingInfo.
            if '吸烟信息' in hroom_col_network and '不可吸烟' not in hroom_col_network:
                nonsmoking_num += 1

        # First occurrence of each service name wins, matching list.index().
        service_details = {}
        for service_detail in hroominfo_json['thisBaseRoomServiceDetailList'] or []:
            service_details.setdefault(service_detail['thisDetailInfoName'], service_detail['thisDetailInfoVal'])
        Addbed_info = service_details.get('可加床', '')
        Area_info = service_details.get('建筑面积', '')

        _run_sql(
            "insert into CtripHotelRoomList([Hid],[RoomCode],[RoomNameCN],[FreeWifi],[SmokingInfo],[AreaInfo],[AddBedInfo],[Language],[UpdateDate],[AddDate]) values(?,?,?,?,?,?,?,?,getdate(),getdate());",
            (selectforHid, roomId, roomName, freewifi_num, nonsmoking_num, Area_info, Addbed_info, "中文"),
        )

    # ---- English rooms -> CtripHotelRoomList -----------------------------
    for hotel_type, Rcode in zip(hotel_type_list, roomid_list):
        h_type_cnt = hotel_type.find("td", {"class": "h-type"}).find("div", {"class": "h-type__cnt"})
        # Drop the trailing "</a>", then keep what follows the opening tag.
        RnameEN = str(h_type_cnt.find("a", {"class": "h-type__name is-link"}))[:-4].partition('>')[2]

        wifi_smoking_info = str(hotel_type.find("td", {"colspan": "6"}))
        freewifi_num_EN = wifi_smoking_info.count('Free') // 2
        nonsmoking_num_EN = wifi_smoking_info.count('Non-smoking')

        # The old check tested str(...), which is always truthy, and crashed
        # when the <ul> was missing; test the tags themselves instead.
        area_info = ''
        facility_ul = h_type_cnt.find("ul", {"class": "o-fi-txt"})
        first_li = facility_ul.find("li") if facility_ul is not None else None
        if first_li is not None:
            area_info = str(first_li)[50:-5]

        _run_sql(
            "insert into CtripHotelRoomList([Hid],[RoomCode],[RoomNameEN],[FreeWifi],[SmokingInfo],[AreaInfo],[UpdateDate],[AddDate],[Language]) values(?,?,?,?,?,?,getdate(),getdate(),?);",
            (selectforHid, Rcode, RnameEN, freewifi_num_EN, nonsmoking_num_EN, area_info, "英文"),
        )

    # ---- Chinese description -> CtripHotelDesc ---------------------------
    # findAll() returns a (possibly empty) list, never None, so test emptiness.
    intro_blocks = Html_Change.findAll("div", {"class": "htl_room_txt text_3l J_tabHeightConShift_1"})
    if intro_blocks:
        intro = str(intro_blocks[0].find("div"))[5:-6]
        # The newline literal was split across lines in the published source;
        # '\n' is the reconstruction.
        intro = intro.replace('\n', '').replace(' ', '')
        _run_sql(
            "insert into CtripHotelDesc([Hid],[Language],[Intro],[AddDate],[UpdateDate]) values(?,?,?,getdate(),getdate());",
            (selectforHid, "中文", intro),
        )

    # ---- English description -> CtripHotelDesc ---------------------------
    hbrief_html = Html_Change_EN.find("div", {"class": "p-hotel-details"}).find("div", {"class": "l-inner"}).find("div", {"class": "m-hotel-brief"}).find("div", {"class": "brief-wrapper"})
    hbrief_html_part1 = hbrief_html.find("p", {"class": "brief-prompt"}).findAll("strong")
    hbrief_html_part2 = str(hbrief_html.find("div", {"class": "brief-cnt"}))[45:-6]
    hbrief_list = '|'.join(str(strong_tag)[8:-9].strip() for strong_tag in hbrief_html_part1)
    hbrief = hbrief_list + '|' + hbrief_html_part2
    # NOTE(review): the two line-break literals were mangled in the published
    # source; '\r' and '\n' are assumed here - confirm against the original.
    hbrief = hbrief.replace('\r', '').replace('\n', '').replace('>', '').replace(' ', '')
    # No manual quote doubling: the value is sent as a bound parameter.
    print(hbrief)
    _run_sql(
        "insert into CtripHotelDesc([Hid],[Language],[Intro],[UpdateDate],[AddDate]) values(?,?,?,getdate(),getdate());",
        (int(selectforHid), "英文", hbrief),
    )

    # ---- Chinese booking info -> CtripHotelBookInfo ----------------------
    htl_info_table = Html_Change.findAll("div", {"class": "htl_info_table detail_con_2 J_tabHeightConShift_1"})[0]
    htl_info = htl_info_table.find("table").find("tbody").findAll("tr")
    # When the table has more than four rows the last one is skipped.
    if len(htl_info) > 4:
        length_htl_info = len(htl_info) - 1
    else:
        length_htl_info = len(htl_info)
    tname_list = []
    tinfo_list = []
    for info_row in htl_info[:length_htl_info]:
        tname_list.append(str(info_row.find('th'))[4:-5])
        tinfo_cuthtml_list = []
        for info_li in info_row.findAll("td")[0].findAll("ul")[0].findAll("li"):
            li_markup = str(info_li)
            info_span = info_li.find("span")
            # Piece 1: text starting at offset 50 up to the next tag.
            tinfo_cut1 = _text_until_lt(li_markup, 50)
            # Piece 2: inner text of the first <span>, minus "</span>".
            tinfo_cut2 = ''
            if info_span is not None:
                span_markup = str(info_span)
                span_open_end = span_markup.find('>')
                if span_open_end != -1:
                    tinfo_cut2 = span_markup[span_open_end + 1:-7]
            # Piece 3: for long items with a span, text from offset 78 on.
            tinfo_cut3 = ''
            if len(li_markup) > 80 and info_span is not None:
                tinfo_cut3 = _text_until_lt(li_markup, 78)
            tinfo_cuthtml_list.append(tinfo_cut1.strip() + tinfo_cut2.strip() + tinfo_cut3.strip())
        tinfo_list.append(','.join(tinfo_cuthtml_list))
    tname = '|'.join(tname_list)
    tinfo = '|'.join(tinfo_list)
    _run_sql(
        "insert into CtripHotelBookInfo([Hid],[Language],[TName],[TInfo],[UpdateDate],[AddDate]) values(?,?,?,?,getdate(),getdate());",
        (selectforHid, "中文", tname, tinfo),
    )

    # ---- English facilities -> CtripHotelBookInfo ------------------------
    hotel_facility_normal = Html_Change_EN.find("div", {"class": "c-hotel-facility"}).find("div", {"class": "c-hotel-facility__wrapper"}).find("div", {"class": "c-hotel-facility__normal"}).findAll("div", {"class": "c-hotel-facility__normal-item u-clearfix"})
    tname_list_EN = []
    tinfo_list_EN = []
    for facility_item in hotel_facility_normal:
        facility_cnt = facility_item.find("div", {"class": "c-hotel-facility__normal-cnt"})
        tname_list_EN.append(_after_last_gt(str(facility_cnt.find("p"))[:-4]))
        tinfo_list_EN.append(','.join(
            str(power_li.find("span"))[6:-7]
            for power_li in facility_cnt.findAll("li", {"class": "u-power"})
        ))
    join_tname_EN = '|'.join(tname_list_EN)
    join_tinfo_EN = '|'.join(tinfo_list_EN)
    _run_sql(
        "insert into CtripHotelBookInfo([Hid],[Language],[TName],[TInfo],[UpdateDate],[AddDate]) values(?,?,?,?,getdate(),getdate());",
        (selectforHid, "英文", join_tname_EN, join_tinfo_EN),
    )

    # ---- brand -> CtripHotelBrand ----------------------------------------
    brand_blocks = Html_Change.findAll("div", {"class": "group_brand htl_room_txt text_3l"})
    if brand_blocks:
        htl_room_txt = brand_blocks[0]
        brand_html_cut = str(htl_room_txt.find("p").find("b"))[3:-4]
        brand_markup = str(htl_room_txt)
        # Last '>' lying at least 12 characters before the end; the intro is
        # the text between it and the final 12 characters of the markup.
        last_gt = brand_markup.rfind('>', 0, max(len(brand_markup) - 11, 0))
        intro_brand = brand_markup[last_gt + 1:-12] if last_gt != -1 else ''
        # The previous statement formatted the string "中文" with %d and
        # therefore always raised TypeError.
        _run_sql(
            "insert into CtripHotelBrand([Language],[BrandName],[Intro],[UpdateDate],[AddDate]) values(?,?,?,getdate(),getdate());",
            ("中文", brand_html_cut, intro_brand),
        )

    # ---- per-room facilities -> CtripHotelFacilities ---------------------
    # Iterates the same base-room rows as above; Html_Change is no longer
    # rebound to the room-list node.
    for base_room in allhroomlist:
        RoomInfoDetailsList_json = _base_room_details(base_room)
        Rinfoname_list = []
        Rinfoval_list = []
        for room_detail in RoomInfoDetailsList_json['thisBaseRoomRoomInfoDetailsList'] or []:
            Rinfoname_list.append(room_detail['thisDetailInfoName'])
            Rinfoval_list.append(','.join(room_detail['thisDetailInfoVal']))
        for service_detail in RoomInfoDetailsList_json['thisBaseRoomServiceDetailList'] or []:
            Rinfoname_list.append(service_detail['thisDetailInfoName'])
            Rinfoval_list.append(service_detail['thisDetailInfoVal'])
        roomid = int(RoomInfoDetailsList_json['comment_baseroomId'])
        roomname = RoomInfoDetailsList_json['comment_baseroomName']
        infoname = '|'.join(Rinfoname_list)
        infoval = '|'.join(Rinfoval_list)

        # Read Hid and Rid from the SAME row; calling fetchone() twice took
        # Rid from the following row (or failed when there was none).
        room_row = _fetch_one("SELECT Hid,Rid from CtripHotelRoomList where RoomCode = ?", (roomid,))
        hid, rid = room_row[0], room_row[1]
        _run_sql(
            "insert into CtripHotelFacilities([Hid],[Rid],[Language],[TName],[TInfo],[RoomCode],[UpdateDate],[AddDate]) values(?,?,?,?,?,?,getdate(),getdate());",
            (hid, rid, "中文", infoname, infoval, roomid),
        )
        print(rid)

    # ---- mark the source row as processed --------------------------------
    _run_sql("update downhotellistLive set DStatus=4 where id=?", (oneselect.ID,))

转载于:https://www.cnblogs.com/kanziliang/p/9438132.html

你可能感兴趣的:(python爬虫实战-----利用selenium爬取表格一)