【python爬虫】抓取炒股概念

非常感谢https://www.cnblogs.com/xin-xin/p/4297852.html。该系列讲解很详细。

另附上我写的抓取炒股概念代码。

采用火狐浏览器,按 F12 打开开发者工具,选取 Network(网络)面板,观察并解析页面实际请求的数据地址。


import urllib.request
import re
import requests

def getPage(pageIndex):
    """Fetch one page of the concept index via the site's AJAX endpoint.

    pageIndex: 1-based page number of the index listing.
    Returns the ``list`` array from the JSON response — one dict per
    concept entry; main() reads its 'title' and 'URL' keys.
    """
    siteURL = "http://www.iwencai.com/yike/index-page-ajax/"
    url = siteURL + "?p=" + str(pageIndex) + "&filterTag=37"
    # Browser-like Referer / User-Agent so the server does not reject
    # the request as an obvious bot.
    headers = {
        'Referer': 'http://www.sse.com.cn/disclosure/credibility/supervision/inquiries/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    r = requests.get(url, headers=headers)
    return r.json()['list']


def getConcrete(subUrl):
    """Fetch a concept's detail page and return its main text.

    subUrl: path fragment taken from an index item's 'URL' field.
    Returns the extracted text, or an empty string when the page could
    not be fetched or no pattern matched (best-effort, like the
    original's bare-except behaviour — but without swallowing every
    exception silently).
    """
    concrete_url = "http://www.iwencai.com/" + subUrl
    try:
        data = urllib.request.urlopen(concrete_url).read()
        content = data.decode('UTF-8')
        # NOTE(review): the original regex pattern strings were lost when
        # this post was scraped (the HTML inside the literals was
        # stripped). The patterns below are PLACEHOLDERS preserving the
        # original structure (primary pattern, then a fallback) — confirm
        # the real tag structure against the live page before relying on
        # them. Both use re.S so '.' spans line breaks, as the original did.
        pattern = re.compile(r'<div class="abstract">(.*?)</div>', re.S)
        items = re.findall(pattern, content)
        if not items:
            pattern = re.compile(r'<div class="article-text">(.*?)</div>', re.S)
            items = re.findall(pattern, content)
        # Guard the [0] access: with no match the original raised
        # IndexError and fell into its bare except; return '' explicitly.
        return items[0] if items else ''
    except Exception as e:
        # Boundary handler: log and keep crawling (original printed a
        # marker and returned an empty result).
        print("异常--------------", e)
        return ''


def main():
    """Crawl index pages 1–299 and print each concept's title and detail.

    Pure console output; skips entries whose detail page yielded no text.
    """
    for page_index in range(1, 300):
        for item in getPage(page_index):
            concrete = getConcrete(item['URL'])
            # Falsy covers both '' (no match / fetch failure) and the
            # original's None/empty checks.
            if concrete:
                print(item['title'])
                # Collapse newlines and spaces so the detail prints as
                # one compact block (the '\n' literal was garbled into a
                # real line break by the scraper).
                print(concrete.replace('\n', '').replace(' ', ''))
                print("================================")


if __name__ == '__main__':
    main()

你可能感兴趣的:(python技能)