# -*- encoding: utf-8 -*-
# coding:utf-8
import xlrd
from xlwt import Workbook
from tempfile import TemporaryFile
import urllib.request
import xml.dom.minidom as minidom
import xlwt
# Result directory: the output workbook is saved here.
inforst = '/home/yuhz/PycharmProjects/untitled2/POI/'
# Input workbook, opened at import time; only its first sheet is used.
data = xlrd.open_workbook('/home/yuhz/PycharmProjects/untitled2/POI/city1.xls')
sheet=data.sheets()[0]
file_name = 'result.txt' # write result to this file
# Keyword example (currently disabled): "car repair" in Chinese, URL-encoded with urllib.parse.quote(keyword)
# Default amap place-search request; the script section rebuilds url_amap for every input row.
url_amap = 'http://restapi.amap.com/v3/place/text?&keywords=&types=010800&city=370602&citylimit=true&&output=xml&offset=20&page=1&key=&extensions=base'
#facility_type = r'types=170300' # factory facilities
#region = r'city=120113' # beichen of tianjin
each_page_rec = 20 # number of results displayed on one page (the request URL uses offset=20)
which_pach = r'page=1' # which page to display
xml_file = 'tmp.xml' # xml file name (the downloaded response is stored here)
# Excel output: the workbook that receives the header rows and the results.
book = Workbook()
#sheet1 = book.add_sheet('Sheet 0')
# Wrap cell text automatically.
style = xlwt.easyxf('align: wrap on')
def getHtml(url):
    """Download *url* and store the raw response body in ``xml_file``.

    Args:
        url: Address to fetch (the amap place-search request).

    Returns:
        0 when the body was fetched and written to ``xml_file``;
        -1 when the request or the write failed with an I/O error
        (the error is printed).
    """
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url) as page:
            html = page.read()
        # Binary mode: the bytes are stored exactly as received.
        with open(xml_file, 'wb') as xml_file_handle:
            xml_file_handle.write(html)
    except IOError as err:
        # URLError/HTTPError and socket errors derive from OSError (== IOError),
        # so network failures are reported through the -1 return value that
        # callers already check, instead of aborting the whole run.
        print("IO error: " + str(err))
        return -1
    return 0
def _poi_text(node, tag):
    """Return the text of the first <tag> element under *node*, or "" if it is missing or empty."""
    try:
        return node.getElementsByTagName(tag)[0].childNodes[0].nodeValue
    except IndexError:
        return ""


def parseXML(index):
    """Parse ``xml_file`` and write its POI records into the current ``sheet1``.

    Each <poi> element becomes one row with the columns name, address, tel,
    pname, cityname, adname and location (a missing or empty element is
    written as an empty string), formatted with the module-level ``style``.

    Args:
        index: Row offset; the first record is written to row ``index + 1``.

    Returns:
        The text of the <count> element of the last <response> element
        ("0" when a response carries no count), or 1 when the file could not
        be read or holds no <response> element. Callers convert it with int().
    """
    columns = ("name", "address", "tel", "pname", "cityname", "adname", "location")
    total_rec = 1  # record number
    try:
        dom = minidom.parse(xml_file)
    except IOError as err:
        print("IO error: " + str(err))
        return total_rec
    # getElementsByTagName returns a NodeList.
    for node in dom.getElementsByTagName("response"):
        total_rec = _poi_text(node, "count")
        if total_rec == "":
            # A response without a record count would otherwise abort the run
            # with an IndexError before the workbook is saved.
            print("warning: response has no <count>; treating it as 0 records")
            total_rec = "0"
        pois = node.getElementsByTagName("pois")
        if not pois:
            continue
        for poi in pois[0].getElementsByTagName('poi'):
            # Write one Excel row per record.
            index = index + 1
            row1 = sheet1.row(index)
            for col, tag in enumerate(columns):
                row1.write(col, _poi_text(poi, tag), style)
    return total_rec
if __name__ == '__main__':
    # Script entry point. For every row of the input sheet (column 0 = owner,
    # column 1 = the value sent as the `city` request parameter) the amap
    # place search is queried page by page, the records are written to a
    # sheet named after that value, and one line is added to the
    # 'All Data' summary sheet.
    import os
    import urllib.parse  # quote() is used below; do not rely on urllib.request importing it

    fetch_error = 'error: fail to get xml from amap'

    # Summary sheet: header row and column widths are set once.
    sheet2 = book.add_sheet('All Data')
    row2 = sheet2.row(0)
    for col, title in enumerate(('Owner', 'city', 'count')):
        row2.write(col, title)
    for col, width in enumerate((10000, 10000, 10000, 5000)):
        sheet2.col(col).width = width

    for i in range(sheet.nrows):
        keywords = sheet.row(i)[1].value
        owner = sheet.row(i)[0].value
        # Reset per row so a failed fetch is not reported with the previous
        # row's record count.
        total_record = 0

        # Detail sheet for this row (read by parseXML through the global name).
        sheet1 = book.add_sheet(keywords)
        row1 = sheet1.row(0)
        for col, title in enumerate(('CORPNAME', 'ADDRESS', 'TEL', 'pname',
                                     'cityname', 'adname', 'location')):
            row1.write(col, title)
        for col, width in enumerate((10000, 10000, 10000, 5000, 5000, 5000, 5000)):
            sheet1.col(col).width = width

        # The request URL only differs in the page number between calls.
        url_head = ('http://restapi.amap.com/v3/place/text?&keywords=&types=010800&city='
                    + urllib.parse.quote(keywords)
                    + '&citylimit=true&&output=xml&offset=' + str(each_page_rec) + '&page=')
        url_tail = '&key=&extensions=base'

        url_amap = url_head + '1' + url_tail
        if getHtml(url_amap) == 0:
            print('parsing page 1 ... ...')
            # Parse the xml file and get the total record number.
            total_record = int(parseXML(0))
            # Ceiling division: number of pages needed for total_record rows.
            page_count = -(-total_record // each_page_rec)
            # Retrieve the remaining pages.
            for each_page in range(2, page_count + 1):
                print('parsing page ' + str(each_page) + ' ... ...')
                url_amap = url_head + str(each_page) + url_tail
                if getHtml(url_amap) != 0:
                    # Do not re-parse the stale file left by the previous page.
                    print(fetch_error)
                    break
                # Rows of page N start right after the (N - 1) full pages before it.
                page_total = int(parseXML((each_page - 1) * each_page_rec))
                if page_total == 0:
                    # Keep the last non-zero count for the summary row.
                    break
                total_record = page_total
        else:
            print(fetch_error)

        # Summary line for this input row.
        row2 = sheet2.row(i + 1)
        row2.write(0, owner)
        row2.write(1, keywords)
        row2.write(2, total_record)

    # Save the workbook into the result directory.
    book.save(os.path.join(inforst, 'result.xls'))
    # NOTE(review): this second save goes to a scratch file that is discarded
    # immediately; it is kept from the original script - confirm whether it
    # is still needed.
    with TemporaryFile() as scratch:
        book.save(scratch)
# Version 1.0: supports reading data across multiple worksheets and writes the results in the original worksheet order.
# -*- encoding: utf-8 -*-
# coding:utf-8
import xlrd
from xlwt import Workbook
from tempfile import TemporaryFile
import urllib.request
import xml.dom.minidom as minidom
import xlwt
# Result directory: the output workbook is saved here.
# NOTE(review): there is no trailing '/' here, so a plain `inforst + 'result.xls'`
# concatenation yields '.../POIresult.xls' next to this directory, not inside it.
inforst = 'C:/Users/玲玲/PycharmProjects/untitled/POI'
# Input workbook, opened at import time; its worksheets are read by name in the script section.
data = xlrd.open_workbook('C:/Users/玲玲/PycharmProjects/untitled/POI/city.xls')
#sheet=data.sheets()[0]
file_name = 'result.txt' # write result to this file
# Keyword example (currently disabled): "car repair" in Chinese, URL-encoded with urllib.parse.quote(keyword)
# Default amap place-search request; the script section rebuilds url_amap for every input row.
url_amap = 'http://restapi.amap.com/v3/place/text?&keywords=&types=010800&city=370602&citylimit=true&&output=xml&offset=20&page=1&key=¥¥&extensions=base'
#facility_type = r'types=170300' # factory facilities
#region = r'city=120113' # beichen of tianjin
each_page_rec = 20 # number of results displayed on one page (the request URL uses offset=20)
which_pach = r'page=1' # which page to display
xml_file = 'tmp.xml' # xml file name (the downloaded response is stored here)
# Excel output: the workbook that receives the header rows and the results.
book = Workbook()
# Row counter of the detail sheet currently being filled; advanced by parseXML.
all_index = 0
#sheet1 = book.add_sheet('Sheet 0')
# Wrap cell text automatically.
style = xlwt.easyxf('align: wrap on')
def getHtml(url):
    """Download *url* and store the raw response body in ``xml_file``.

    Args:
        url: Address to fetch (the amap place-search request).

    Returns:
        0 when the body was fetched and written to ``xml_file``;
        -1 when the request or the write failed with an I/O error
        (the error is printed).
    """
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url) as page:
            html = page.read()
        # Binary mode: the bytes are stored exactly as received.
        with open(xml_file, 'wb') as xml_file_handle:
            xml_file_handle.write(html)
    except IOError as err:
        # URLError/HTTPError and socket errors derive from OSError (== IOError),
        # so network failures are reported through the -1 return value that
        # callers already check, instead of aborting the whole run.
        print("IO error: " + str(err))
        return -1
    return 0
def _poi_text(node, tag):
    """Return the text of the first <tag> element under *node*, or "" if it is missing or empty."""
    try:
        return node.getElementsByTagName(tag)[0].childNodes[0].nodeValue
    except IndexError:
        return ""


def parseXML(owner):
    """Parse ``xml_file`` and append its POI records to the current ``sheet1``.

    Each <poi> element becomes one row with the columns name, address, tel,
    pname, cityname, adname and location (a missing or empty element is
    written as an empty string) followed by *owner*, formatted with the
    module-level ``style``. The module-level ``all_index`` row counter is
    advanced by one before each row is written.

    Args:
        owner: Value written to column 7 of every row produced by this call.

    Returns:
        The text of the <count> element of the last <response> element
        ("0" when a response carries no count), or 1 when the file could not
        be read or holds no <response> element. Callers convert it with int().
    """
    global all_index
    columns = ("name", "address", "tel", "pname", "cityname", "adname", "location")
    total_rec = 1  # record number
    try:
        dom = minidom.parse(xml_file)
    except IOError as err:
        print("IO error: " + str(err))
        return total_rec
    # getElementsByTagName returns a NodeList.
    for node in dom.getElementsByTagName("response"):
        total_rec = _poi_text(node, "count")
        if total_rec == "":
            # A response without a record count would otherwise abort the run
            # with an IndexError before the workbook is saved.
            print("warning: response has no <count>; treating it as 0 records")
            total_rec = "0"
        pois = node.getElementsByTagName("pois")
        if not pois:
            continue
        for poi in pois[0].getElementsByTagName('poi'):
            # Write one Excel row per record, continuing below the rows
            # already written to this sheet.
            all_index = all_index + 1
            row1 = sheet1.row(all_index)
            for col, tag in enumerate(columns):
                row1.write(col, _poi_text(poi, tag), style)
            row1.write(len(columns), owner, style)
    return total_rec
if __name__ == '__main__':
    # Script entry point. Every worksheet of the input workbook gets a result
    # sheet of the same name. For each of its rows (column 0 = owner,
    # column 1 = the value sent as the `city` request parameter) the amap
    # place search is queried page by page, the records are appended to that
    # result sheet, and one line is added to the 'All Data' summary sheet.
    import os
    import urllib.parse  # quote() is used below; do not rely on urllib.request importing it

    # Set to True when the owner column contains merged cells: rows whose
    # owner cell is blank then inherit the most recent non-blank owner.
    owner_column_merged = False
    fetch_error = 'error: fail to get xml from amap'

    # Open the worksheets.
    worksheets = data.sheet_names()

    # Summary sheet: header row and column widths are set once.
    sheet2 = book.add_sheet('All Data')
    row2 = sheet2.row(0)
    for col, title in enumerate(('Owner', 'city', 'count', 'area')):
        row2.write(col, title)
    for col, width in enumerate((10000, 10000, 10000, 5000)):
        sheet2.col(col).width = width
    summary_index = 1

    # Walk through all worksheets in their original order.
    for worksheet_name in worksheets:
        sheet = data.sheet_by_name(worksheet_name)

        # Result sheet for this worksheet (read by parseXML through the global name).
        sheet1 = book.add_sheet(worksheet_name)
        row1 = sheet1.row(0)
        for col, title in enumerate(('CORPNAME', 'ADDRESS', 'TEL', 'pname',
                                     'cityname', 'adname', 'location', 'owner')):
            row1.write(col, title)
        for col, width in enumerate((10000, 10000, 10000, 5000, 5000, 5000, 5000, 2000)):
            sheet1.col(col).width = width

        # parseXML appends below the rows already written; restart per sheet.
        all_index = 0
        last_owner = ''
        for i in range(sheet.nrows):
            keywords = sheet.row(i)[1].value
            owner = sheet.row(i)[0].value
            # Track the latest non-blank owner of the *current* row so that the
            # merged-cell fallback never lags one row behind.
            if str(owner).strip() != '':
                last_owner = owner
            row_owner = last_owner if owner_column_merged else owner
            # Reset per row so a failed fetch is not reported with the
            # previous row's record count.
            total_record = 0

            # The request URL only differs in the page number between calls.
            url_head = ('http://restapi.amap.com/v3/place/text?&keywords=&types=010800&city='
                        + urllib.parse.quote(keywords)
                        + '&citylimit=true&&output=xml&offset=' + str(each_page_rec) + '&page=')
            url_tail = '&key=*****&extensions=base'

            url_amap = url_head + '1' + url_tail
            if getHtml(url_amap) == 0:
                print('parsing page 1 ... ...')
                # Parse the xml file and get the total record number.
                total_record = int(parseXML(row_owner))
                # Ceiling division: number of pages needed for total_record rows.
                page_count = -(-total_record // each_page_rec)
                # Retrieve the remaining pages.
                for each_page in range(2, page_count + 1):
                    print('parsing page ' + str(each_page) + ' ... ...')
                    url_amap = url_head + str(each_page) + url_tail
                    if getHtml(url_amap) != 0:
                        # Do not re-parse the stale file left by the previous page.
                        print(fetch_error)
                        break
                    page_total = int(parseXML(row_owner))
                    if page_total == 0:
                        # Keep the last non-zero count for the summary row.
                        break
                    total_record = page_total
            else:
                print(fetch_error)

            # Summary line for this input row.
            row2 = sheet2.row(summary_index)
            row2.write(0, row_owner)
            row2.write(1, keywords)
            row2.write(2, total_record)
            row2.write(3, worksheet_name)
            summary_index = summary_index + 1

    # Save the workbook inside the result directory; os.path.join supplies the
    # separator that plain concatenation with `inforst` would omit.
    book.save(os.path.join(inforst, 'result.xls'))
    # NOTE(review): this second save goes to a scratch file that is discarded
    # immediately; it is kept from the original script - confirm whether it
    # is still needed.
    with TemporaryFile() as scratch:
        book.save(scratch)