# -*- coding: utf-8 -*-
"""
Created on Sat Oct 22 17:57:13 2016
@author: hhxsym
买粮网搜索列表获取 (fetch search result listings from mailiangwang.com)
"""
import os
import requests
from bs4 import BeautifulSoup
# User-specific working directory containing the course's scraper materials.
inpath="C:\\Users\\hhxsym\\Desktop\\课程群Python爬虫"
# Python 2: decode the byte string to unicode so the non-ASCII (Chinese)
# Windows path can be handled correctly by os.chdir below.
inpath = unicode(inpath , "utf8")
os.chdir(inpath) # Without the decode above, the Chinese path fails to open.
def get_search_list(keyword=None, page=1, outfile='data.txt'):
    """Fetch one page of mailiangwang.com merchant search results and save them.

    Writes a pipe-separated text file with one header row followed by one row
    per merchant: company name | registered capital | address | main category.

    Parameters:
        keyword: search keyword (unicode, e.g. u'玉米'); None fetches the
            default listing.
        page: 1-based result page number.
        outfile: path of the output file (default 'data.txt', matching the
            original behavior).
    """
    url = 'http://www.mailiangwang.com/biz/list'
    payload = {'keyword': keyword, 'pageid': page}
    # requests builds the query string from `params` (keyword/pageid).
    response = requests.get(url, params=payload)
    print(response.url)          # final URL actually requested, for debugging
    print(response.status_code)  # quick sanity check on the HTTP status
    soup = BeautifulSoup(response.text, 'lxml')
    # Every field lives under the same item container; build all four selectors
    # from one shared prefix instead of repeating the full CSS path four times
    # (repetition is how the selectors drifted apart in earlier experiments).
    item = 'body > div.wrap > div.merchantList > div.p_dataList > div.p_dataItem'
    names = soup.select(item + ' > span.n1 > a')
    capitals = soup.select(item + ' > span.n3')
    addresses = soup.select(item + ' > span.n5')
    categories = soup.select(item + ' > span.n6')
    with open(outfile, 'w') as f:
        f.write('公司名称|注册资本|公司地址|主营品类\n')  # header row
        # zip truncates to the shortest list, so a malformed page cannot
        # produce misaligned rows past the shortest column.
        for name, capital, address, category in zip(names, capitals, addresses, categories):
            # 'title' may be missing on a malformed row; Tag.get returns None
            # then, so fall back to '' instead of crashing on .strip().
            company = (name.get('title') or '').strip()
            row = [company, capital.text, address.text, category.text + '\n']
            # File is opened in byte mode (Python 2): encode unicode to UTF-8
            # before writing, otherwise f.write raises UnicodeEncodeError.
            f.write('|'.join(row).encode('utf-8'))
    print('写入成功!')
if __name__ == '__main__':
    # Demo run: fetch the first results page for the keyword below.
    get_search_list(keyword=u'玉米', page=1)
#文本数据excel方法: 复制 -> 粘贴到excel -> 菜单栏"数据" -> 分列 -> ‘分隔符号’ ->……