Creating a Crawler: Traversing a Single Domain to Crawl Ganji.com

Crawl the page links on Ganji.com.

# Crawl the links on the static Ganji.com homepage
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen("http://ty.ganji.com/")
bsObj = BeautifulSoup(html, "html.parser")  # name the parser explicitly

# Keep only anchors in the content column whose href is a relative path
for link in bsObj.find("div", {"class": "content-col"}).findAll("a", href=re.compile("^[a-z]")):
    if 'href' in link.attrs:
        print("http://ty.ganji.com/" + link.attrs['href'])

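urlopen raises urllib.error.HTTPError when the server answers with a status like 404, which is what happened on the listings page below. A minimal sketch of catching that error instead of letting it crash the script (the URL reuses the listings page from the next block):

# Minimal sketch: catch the HTTPError instead of letting it crash the script
from urllib.request import urlopen
from urllib.error import HTTPError

try:
    html = urlopen("http://ty.ganji.com/fang1/")
except HTTPError as e:
    # e.code is the status the server returned, e.g. 404
    print("Request failed with status", e.code)
else:
    print("Fetched", len(html.read()), "bytes")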

# A 404 HTTPError came back, so send browser-style request headers instead
from bs4 import BeautifulSoup
import requests
import re

session = requests.Session()
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
}

url = "http://ty.ganji.com/fang1/"
req = session.get(url, headers=headers)
bsObj = BeautifulSoup(req.text, "html.parser")

# Keep only anchors in the listings container whose href starts with "/"
for link in bsObj.find("div", {"class": "f-list js-tips-list"}).findAll("a", href=re.compile("^/")):
    if 'href' in link.attrs:
        print("http://ty.ganji.com" + link.attrs['href'])

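Printing one page's links is only the first step of traversing a single domain. A common next step is to wrap the extraction in a function and track visited paths in a set, so the crawler can follow internal links without looping. The sketch below assumes this structure; the getlinks name, the max_pages budget, and the shortened User-Agent are illustrative, not from the original post:

# Minimal sketch of traversing the single domain: follow internal links,
# skipping pages already visited. getlinks() and max_pages are
# illustrative assumptions, not from the original post.
import re
import requests
from bs4 import BeautifulSoup

session = requests.Session()
# Shortened User-Agent for brevity; reuse the full header set from above in practice
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64)"}
visited = set()

def getlinks(path, max_pages=10):
    # Stop when the page was already seen or the crawl budget is spent
    if path in visited or len(visited) >= max_pages:
        return
    visited.add(path)
    resp = session.get("http://ty.ganji.com" + path, headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")
    for link in soup.findAll("a", href=re.compile("^/")):
        href = link.attrs.get("href")
        if href and href not in visited:
            print("http://ty.ganji.com" + href)
            getlinks(href, max_pages)

getlinks("/fang1/")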
