Python Web Scraping Basics

Prerequisite concepts: URLs and HTTP (the Hypertext Transfer Protocol).

Scraping workflow: confirm the requirement → locate the target data → send requests → parse the data → store the data. In the script below, askurl sends the requests, getdata parses the pages, and saveData stores the results.


Coding conventions

# -*- coding: utf-8 -*-    or    # coding=utf-8

A Python file can define a main function to test the program and guard it with if __name__ == "__main__": so that main() is called only when the file is run directly (the file must define a main function).

Without this guard, the interpreter simply executes the file's statements from top to bottom.
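A minimal sketch of the pattern:

def main():
    print("running as a script")

if __name__ == "__main__":
    # Runs only when the file is executed directly,
    # not when it is imported as a module.
    main()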

# Import the modules used below (bs4 and xlwt are third-party)
from bs4 import BeautifulSoup        # web-page parsing
import re                            # regular expressions
import urllib.request, urllib.error  # build URLs, fetch page data
import xlwt                          # Excel (.xls) output
import sqlite3                       # SQLite database operations

baseurl = "https://movie.douban.com/top250?start="  # list page; start= advances 25 films at a time

# Regular-expression objects expressing the extraction rules (string patterns).
# NOTE: the HTML tag text inside these patterns was eaten by the blog renderer;
# the tags below are reconstructed from Douban's Top 250 page structure.
findlink = re.compile(r'<a href="(.*?)">')                    # detail-page link
findimg = re.compile(r'<img.*src="(.*?)"', re.S)              # poster image URL
findTitle = re.compile(r'<span class="title">(.*)</span>')    # film title
findrating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')  # score
findjudege = re.compile(r'<span>(\d*)人评价</span>')           # number of ratings
findinq = re.compile(r'<span class="inq">(.*)</span>')        # one-line summary
findbd = re.compile(r'<p class="">(.*?)</p>', re.S)           # director/cast block

# re.S makes '.' match newline characters as well
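A quick check of one of these patterns against a hand-written snippet (the HTML below is a made-up stand-in for Douban's real markup):

import re
findTitle = re.compile(r'<span class="title">(.*)</span>')
snippet = '<span class="title">The Shawshank Redemption</span>'
print(re.findall(findTitle, snippet))  # ['The Shawshank Redemption']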

savepath = "豆瓣电影top250.xls"  # output workbook
datalist = []                    # one row of fields per film, filled by getdata

def main():
    # getdata(baseurl)
    datalist = getdata(baseurl)
    saveData(datalist, savepath)

def getdata(baseurl):
    for i in range(0, 10):               # 10 list pages, 25 films each
        url = baseurl + str(i * 25)
        html = askurl(url)
        # parse each page item by item
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all("div", class_="item"):
            data = []
            item = str(item)
            # print(item)
            # break
            link = re.findall(findlink, item)[0]
            data.append(link)
            img = re.findall(findimg, item)[0]
            data.append(img)
            title = re.findall(findTitle, item)
            if len(title) == 2:
                ctitle = title[0]                    # Chinese title
                data.append(ctitle)
                otitle = title[1].replace("/", "")   # foreign title
                data.append(otitle)
            else:
                data.append(title[0])
                data.append(' ')                     # no foreign title; keep columns aligned
            rating = re.findall(findrating, item)[0]
            data.append(rating)
            jubge = re.findall(findjudege, item)[0]
            data.append(jubge)
            inq = re.findall(findinq, item)
            if len(inq) != 0:
                inq = inq[0].replace("。", "")        # drop the trailing full stop
                data.append(inq)
            else:
                data.append(" ")
            bd = re.findall(findbd, item)[0]
            # the <br/> in this pattern was also eaten by the renderer; reconstructed
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)
            bd = re.sub("/", " ", bd)
            data.append(bd.strip())
            datalist.append(data)
    # print(datalist)
    return datalist

def askurl(url):
    head = {
        # pose as a normal browser so the server returns the page
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
    }
    html = ""
    try:
        request = urllib.request.Request(url, headers=head)
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        # print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
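Fetching the first list page, for example (purely illustrative):

html = askurl("https://movie.douban.com/top250?start=0")
print(html[:200])  # peek at the beginning of the response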

def saveData(datalist, savepath):
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('豆瓣电影top250', cell_overwrite_ok=True)
    # column headers, in the order the fields were appended in getdata:
    # link, image, Chinese title, foreign title, rating, rating count, summary, details
    col = ("电影链接", "图片链接", "影片中文名", "影片外国名", "评分", "评价数", "概况", "相关信息")
    for i in range(8):
        sheet.write(0, i, col[i])
    for i in range(250):
        print("第%d条" % i)  # progress: row i
        data = datalist[i]
        for j in range(8):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)

if __name__ == "__main__":
    main()
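The imports mention sqlite3, but the listing above only writes to Excel. A minimal sketch of storing the same datalist in SQLite instead; the table and column names here are my own illustrative choices, not from the original post:

import sqlite3

def saveData2DB(datalist, dbpath):
    # hypothetical schema: one TEXT column per field, in getdata's append order
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    cur.execute("""
        CREATE TABLE IF NOT EXISTS movie250 (
            link TEXT, img TEXT, cname TEXT, oname TEXT,
            score TEXT, judge TEXT, inq TEXT, info TEXT
        )
    """)
    cur.executemany("INSERT INTO movie250 VALUES (?, ?, ?, ?, ?, ?, ?, ?)", datalist)
    conn.commit()
    conn.close()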




