# -*- coding:utf-8 -*-
"""Fetch a web page and report the links on it that point at Taobao."""
import re

import requests
from bs4 import BeautifulSoup

import Debug


class VisitWeb(object):
    """Download a page over a reusable HTTP session and inspect its tags.

    Call ``visit`` first to fetch and parse a page, then ``find_all`` to
    report the matching links through ``Debug.print_debug_info``.
    """

    # One character in the \u4E00-\u9FA5 range (CJK ideographs). The original
    # pattern was ".*?([\u4E00-\u9FA5])"; with ``findall`` it yields exactly
    # the same characters, but the lazy prefix backtracks quadratically on
    # long lines that contain no such character.
    _CJK_CHAR = re.compile("[\u4E00-\u9FA5]")

    def __init__(self):
        # A single session is reused so cookies and connections persist
        # across successive ``visit`` calls.
        self.session = requests.Session()
        # Parsed document of the most recently visited page; None until
        # ``visit`` has been called.
        self.soup = None

    def visit(self, url, timeout=10):
        """Fetch *url* and parse the response body with the lxml parser.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout a stalled server would block forever.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        response = self.session.get(url, timeout=timeout)
        self.soup = BeautifulSoup(response.text, 'lxml')

    def find_all(self, name):
        """Report every *name* tag whose ``href`` contains "taobao".

        For each matching tag one line of the form ``"<index>: <label>
        <href>"`` is passed to ``Debug.print_debug_info``. ``index`` is the
        tag's position among all *name* tags on the page, and ``label`` is
        the concatenation of the CJK characters found in the tag's
        serialized markup (text and attribute values alike).

        Args:
            name: Tag name to search for, e.g. ``'a'``.

        Raises:
            AttributeError: If ``visit`` has not been called yet, because
                there is no parsed document to search.
        """
        # enumerate() walks every tag; the previous index loop stopped one
        # short and never looked at the last tag.
        for index, link in enumerate(self.soup.find_all(name)):
            # Read the attribute itself rather than searching the markup
            # for the substring "href", which could match text or a child
            # tag and then fail on the attribute lookup.
            href = link.get('href')
            if href is None:
                continue
            href = str(href)
            if "taobao" not in href:
                continue
            label = ''.join(self._CJK_CHAR.findall(str(link)))
            Debug.print_debug_info("{}: {} {}".format(index, label, href))


if __name__ == '__main__':
    visitor = VisitWeb()
    url = 'https://www.taobao.com/'
    visitor.visit(url)
    visitor.find_all('a')