Using yield to scrape paginated web data elegantly
When scraping web data with Python, you constantly run into pagination. On some sites the "next page" button carries a concrete link address, while on others paging is handled by JavaScript, so the code has to both parse the content of the current page and work out the URL of the next one. How do you write this kind of code elegantly in Python, or put differently, how do you make it more Pythonic? A generator is a natural fit: yield each page's content as soon as it is fetched, then follow the next-page URL.
Two code examples of the approach, for two different sites, are given below.
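Both examples share the same skeleton: download a page, yield it, find the next-page link, and recurse, re-yielding whatever the inner generator produces. Stripped of the retries and site-specific parsing, the pattern looks roughly like this (a minimal sketch modeled on the first site; only requests and BeautifulSoup are assumed):

    import requests
    from bs4 import BeautifulSoup

    BASE_URL = 'http://www.etnet.com.hk'  # used to absolutize the relative href

    def iter_pages(url):
        '''Yield the HTML of every page in a pagination chain, lazily.'''
        content = requests.get(url, timeout=30).content
        yield content  # hand this page to the caller before touching the next one
        soup = BeautifulSoup(content, 'html5lib')
        e_next = soup.find('a', text=u'下頁')  # the "next page" anchor, if any
        if e_next is not None and e_next.get('href'):
            for page in iter_pages(BASE_URL + e_next.get('href')):
                yield page  # re-yield everything the recursive call produces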
The first version walks etnet.com.hk, following the 下頁 ("next page") anchor. (Python 2 code; curr_session, TIMEOUT, headers, cookies and save_html_content are defined elsewhere in the scraper.)

    import time
    from bs4 import BeautifulSoup

    def get_next_page(obj):
        '''Get next page content from a url, or parse content passed in directly.'''
        content = None  # so the error path below cannot raise a NameError
        error_occurred = False
        for retry2 in xrange(3):
            try:
                if isinstance(obj, basestring):  # a URL: download it
                    resp = curr_session.get(obj, timeout=TIMEOUT, headers=headers,
                                            cookies=cookies, allow_redirects=True)
                    content = resp.content
                    save_html_content(obj, content)
                else:  # already-downloaded content: just parse it
                    content = obj
                soup = BeautifulSoup(content, features='html5lib',
                                     from_encoding='utf8')
                e_next_page = soup.find('a', text=u'下頁')  # "next page" anchor
                error_occurred = False  # clear only once download and parse succeed
                break
            except Exception:
                error_occurred = True
                time.sleep(2)
        if error_occurred:  # all three attempts failed: yield what we have and stop
            yield content
            return
        if e_next_page:
            next_url = 'http://www.etnet.com.hk' + e_next_page.get('href')
            time.sleep(2)
            yield content  # hand the current page to the caller...
            for i in get_next_page(next_url):  # ...then recurse into the next one
                yield i
        else:  # last page
            yield content
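The trailing loop that re-yields from the recursive call is exactly the delegation that PEP 380 later turned into syntax. On Python 3.3+ the same two lines collapse into a single statement:

    # Python 2: re-yield each item from the sub-generator by hand
    for i in get_next_page(next_url):
        yield i

    # Python 3.3+ equivalent (PEP 380)
    yield from get_next_page(next_url)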
The second version targets aastocks.com, where the next-page URL cannot simply be read off an anchor: it has to be rebuilt from an industrysymbol=... fragment found in the page body, with the page number threaded through the recursion.

    import re
    import time
    import traceback

    def get_next_page(obj, page=1):
        '''Get next page content from a url, or parse content passed in directly.'''
        content = None
        error_occurred = False
        for retry2 in xrange(3):
            try:
                if isinstance(obj, basestring):  # a URL: download it
                    resp = curr_session.get(obj, timeout=TIMEOUT, headers=headers,
                                            cookies=cookies, allow_redirects=True)
                    content = resp.content
                    save_html_content(obj, content)
                    hrefs = re.findall('industrysymbol=.*&market_id=[^;]+', content)
                    # On the first visit, rewrite the URL to its sh=0 variant
                    # and restart the crawl from page 1.
                    if page == 1 and 'sh=' not in obj and hrefs:
                        reset_url = ('http://www.aastocks.com/tc/cnhk/market/industry'
                                     '/sector-industry-details.aspx?%s&page=1'
                                     % hrefs[0].replace('sh=1', 'sh=0')
                                               .replace('&page=', '')
                                               .replace("'", '').split()[0])
                        for next_page in get_next_page(reset_url):
                            yield next_page
                        return
                else:  # already-downloaded content: just parse it
                    content = obj
                soup = BeautifulSoup(content, features='html5lib',
                                     from_encoding='utf8')
                e_next_page = soup.find('td', text=u'下一頁 ')  # note trailing space
                error_occurred = False
                break
            except Exception:
                error_occurred = True
                LOG.error(traceback.format_exc())
                time.sleep(2)
        if error_occurred:  # all three attempts failed: yield what we have and stop
            yield content
            return
        if e_next_page:
            hrefs = re.findall('industrysymbol=.*&market_id=[^;]+', content)
            if hrefs:
                next_url = ('http://www.aastocks.com/tc/cnhk/market/industry'
                            '/sector-industry-details.aspx?%s&page=%d'
                            % (hrefs[0].replace('sh=1', 'sh=0')
                                       .replace('&page=', '')
                                       .replace("'", '').split()[0], page + 1))
                time.sleep(2)
                yield content
                for next_page in get_next_page(next_url, page + 1):
                    yield next_page
            else:
                yield content  # "next page" cell found but no usable link: stop here
        else:  # last page
            yield content
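One caveat with the recursive style: each page adds another live generator frame, so a listing with thousands of pages could eventually hit Python's recursion limit (1000 frames by default). If that is a concern, the same laziness can be had from a flat while loop. A minimal sketch, where fetch_with_retries and extract_next_url are hypothetical stand-ins for the retry and URL-rebuilding logic above:

    import time

    def get_all_pages(start_url):
        '''Iterative equivalent of the recursive generator: no nested frames.'''
        url, page = start_url, 1
        while url:
            content = fetch_with_retries(url)      # hypothetical: retry loop as above
            yield content
            url = extract_next_url(content, page)  # hypothetical: None on last page
            page += 1
            time.sleep(2)                          # throttle between requests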
The caller just iterates over the generator; each iteration transparently triggers the download of the next page:

    for curr_href in e_href:
        # Randomized pause between entry pages, to avoid hammering the site.
        retry_interval = random.randint(MIN_INTERVAL_SECONDS_FOR_RETRIEVING,
                                        MAX_INTERVAL_SECONDS_FOR_RETRIEVING)
        time.sleep(retry_interval)
        contents = get_next_page(curr_href)
        for content in contents:  # pages are fetched lazily, one per iteration
            get_page_data(content)
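Because get_next_page is a generator, nothing beyond the current page is downloaded until the inner loop asks for it. That also makes it easy to cap a crawl without touching the scraper itself, for example with itertools.islice:

    import itertools

    # Process at most the first 10 pages; no requests are issued beyond that.
    for content in itertools.islice(get_next_page(curr_href), 10):
        get_page_data(content)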