python网络爬虫 开发第一天

python网络爬虫 开发第一天

    • 选用框架
    • 网页分类
    • 作用
    • 用正则表达式提取数据(正则简单应用)

选用框架

scrapy requests +beautifulsoup(待定)

scrapy为框架 添加requests和beautifulsoup库

scrapy基于twisted 异步I/O框架 性能非常优异

scrapy方便拓展 内置css和xpath selector比纯python的beautifulsoup快

网页分类

1.静态网页 事先生成好的 不会变
2.动态网页(webservice 向后台发起restapi请求 填充数据)

作用

1.搜索引擎
2.推荐引擎
3.机器学习数据样本
4.数据分析,舆论分析

用正则表达式提取数据(正则简单应用)

# -*- coding:utf-8 -*-
import re

# Extract a birth date that may be written in several formats.
# `line` is the sample actually matched below; line1..line4 are alternative
# spellings of the date (slashes, dashes, zero-padded, year-month only).
line = "XXX出生于2009年2月3日"
line1 = "XXX出生于2009/2/3"
line2 = "XXX出生于2009-2-3"
line3 = "XXX出生于2009-02-03"
line4 = "XXX出生于2009-02"
# Raw string: "\d" in a normal string literal is an invalid escape sequence
# and triggers a warning on modern Python. The runtime pattern is unchanged.
# Group 1 = four-digit year, a separator, a 1-2 digit month, then either
# separator + day, a trailing separator at end of text, or end of text.
regex_str = r".*出生于(\d{4}[年/-]\d{1,2}([月/-]\d{1,2}|[月/-]$|$))"
match_obj = re.match(regex_str, line)
if match_obj:
    print(match_obj.group(1))


# "?" after a quantifier makes it non-greedy (expands from the left);
# a greedy quantifier grabs everything and backs off from the right.
# "^x" requires the text to start with x, "." is any character, and "*"
# repeats the preceding item any number of times.
line_1 = "booooooooobooby123"
regex_str1 = "(^b.*)"
match_flag = re.compile(regex_str1).match(line_1)  # pattern first, then the text to test
if match_flag is None:
    print("First String is not b")
else:
    print("First String is b")
    print(match_flag.group(1))  # group(1) is the text captured by the first parenthesis
print("------------Example_1 is End--------")

# "$" anchors the end of the text: "x$" means the text must end with x.
line_2 = "booooooooobooby123"
regex_str2 = "(.*3$)"
match_flag = re.compile(regex_str2).match(line_2)  # pattern first, then the text to test
print("Last String is 3" if match_flag else "Last String is not 3")
if match_flag:
    print(match_flag.group(1))
print("------------Example_2 is End--------")

# "+" means the preceding item must occur at least once.
# The pattern looks for a "b", one or more arbitrary characters, then another "b".
line_3 = "booooooooobooby123"
regex_str3 = ".*(b.+b).*"
match_flag = re.match(regex_str3, line_3)  # arguments: (pattern, text to test)
if match_flag:
    # The messages previously said "Last String is 3", copied from the
    # end-anchor example; they now describe what this pattern checks.
    print("Found b.+b")
    print(match_flag.group(1))
else:
    print("Not found b.+b")
print("------------Example_3 is End--------")

# ".{2}" is any character exactly twice; "{2,}" is two or more times;
# "{2,5}" is between two and five times.
line_4 = "booooooooobooby123"
regex_str4 = ".*(b.{2}b).*"
match_flag = re.match(pattern=regex_str4, string=line_4)  # pattern, then the text to test
if match_flag is not None:
    print("Append bXXb", match_flag.group(1), sep="\n")
else:
    print("Not have bXXb")
print("------------Example_4 is End--------")

# "{3,}" demands at least three arbitrary characters between the two b's.
line_5 = "booooooooobooby123"
regex_str5 = ".*(b.{3,}b).*"
match_flag = re.compile(regex_str5).match(line_5)  # pattern first, then the text to test
if match_flag is None:
    print("Not have bX*2+b")
else:
    print("Append bX*2+b")
    print(match_flag.group(1))
print("------------Example_5 is End--------")

# "[acvf]" matches any one of the listed characters; a range such as "[0-9]"
# is allowed, and "[^1]" means "anything except 1". Inside the brackets most
# characters are taken literally.
line_6 = "booooooooobooby123"
regex_str6 = ".*([abcde].+b).*"
match_flag = re.match(pattern=regex_str6, string=line_6)  # pattern, then the text to test
if match_flag is None:
    print("Not have String")
else:
    print("Show String", match_flag.group(1), sep="\n")
print("------------Example_6 is End--------")

# "|" is alternation: either the left or the right alternative may match.
# Alternatives are tried left to right, so the first one that fits is used.
line_7 = "booooooooobooby123"
regex_str7 = ".*(booooooooobooby|booooooooobooby123).*"
match_flag = re.compile(regex_str7).match(line_7)  # pattern first, then the text to test
print("Show String" if match_flag else "Not have String")
if match_flag:
    print(match_flag.group(1))
print("------------Example_7 is End--------")

# "\s" matches a whitespace character; "\S" matches any non-whitespace one.
line_8 = "b oo uuwyueoru"
# Raw string: "\s" in a normal string literal is an invalid escape sequence
# and triggers a warning on modern Python. The runtime pattern is unchanged.
regex_str8 = r".*(b\s.*.\su).*"
match_flag = re.match(regex_str8, line_8)  # arguments: (pattern, text to test)
if match_flag:
    print("Show String")
    print(match_flag.group(1))
else:
    print("Not have String")
print("------------Example_8 is End--------")

# With the pattern (你\s好), text with one whitespace character between the
# two words matches, while "你好" (no whitespace) does not.
# With \S in the middle, "你很好" and "你嘻好" match, but "你 好" does not.
line_9 = "b oo uuwyueoru"
# Raw string: "\S" in a normal string literal is an invalid escape sequence
# and triggers a warning on modern Python. The runtime pattern is unchanged.
regex_str9 = r".*(b\S.*.\Su).*"
match_flag = re.match(regex_str9, line_9)  # arguments: (pattern, text to test)
if match_flag:
    print("Show String")
    print(match_flag.group(1))
else:
    # Reached for this sample: the only "b" is followed by a space, so "b\S" fails.
    print("This string have blank")
print("------------Example_9 is End--------")

# "\w" is shorthand for [0-9A-Za-z_]; "\W" is its opposite.
# This example uses an explicit class instead: a run of at least five
# lowercase ASCII letters.
line_10 = "boouuwyueoru"
regex_str10 = ".*([a-z]{5,}).*"
match_flag = re.match(regex_str10, line_10)  # arguments: (pattern, text to test)
if match_flag:
    print("Show String")
    # The greedy leading ".*" leaves only the last five letters for the group.
    print(match_flag.group(1))
else:
    # The message previously said "This string have blank", copied from the
    # whitespace example; this pattern does not test for blanks.
    print("No run of 5+ lowercase letters found")
print("------------Example_10 is End--------")

#[\u4E00-\u9FA5]固定写法 只要出现汉字就提取
line_11 = "他们说快写一首情歌"
regex_str11 = ".*?([\u4E00-\u9FA5]+).*"
match_flag=re.match(regex_str11,line_11)                          #参数(模式字符串,要匹配的字符串)
if match_flag:
    print("Show String")
    print(match_flag.group(1))
else:
    print("i cant find chinese")
print("------------Example_11 is End--------")

#line = "color is 黄色小狗"  regex_str = .*([\u4E00-\u9FA5]+小狗)" 可以提取出黄色/其他颜色小狗

# "\d" matches a digit. Compare a greedy prefix ".*(\d+)" with a
# non-greedy one ".*?(\d+)" on the same text.
line_12 = "12345iwuyiwqy987689"
line_13 = "12345iwuyiwqy987689"
# Raw strings: "\d" in a normal string literal is an invalid escape sequence
# and triggers a warning on modern Python. The runtime patterns are unchanged.
regex_str12 = r".*(\d+).*"
regex_str13 = r".*?(\d+).*"
match_flag = re.match(regex_str12, line_12)  # arguments: (pattern, text to test)
match_flag1 = re.match(regex_str13, line_13)  # arguments: (pattern, text to test)
if match_flag:
    print("Show String")
    # Greedy prefix swallows all but the final digit.
    print(match_flag.group(1))
else:
    print("Number is not found")

if match_flag1:
    print("Show String")
    # Non-greedy prefix stops at once, so the first digit run is captured.
    print(match_flag1.group(1))
else:
    print("Number is not found")
print("------------Example_12 is End--------")


# Summary of the greedy / non-greedy variants on one sample.
# NOTE: this rebinds line, regex_str and regex_str1..regex_str4 from the
# examples above.
line = "booooooooobooby123"
regex_str_1 = ".*?(b.*?b).*"
regex_strSpecialA = "(bobby|bobby123)"          # matches either of the two alternatives
regex_strSpecialB = "((bobby|boobby)123)"       # groups are numbered outside-in: the outer group includes the 123
regex_strSpecialC = "([abcd]oobby123)"          # first character may be any one of a, b, c, d
regex_str = ".*?(b.*?b).*"                      # non-greedy on both sides: shortest leading b...b
regex_str1 = "^b.*3$"                           # must start with b and end with 3
regex_str2 = ".*(b.*b).*"                       # greedy prefix: the group is the last b...b span
# The pattern used to start with a stray space, so re.match could never
# succeed on `line` (which starts with "b") and match_obj3 was always None.
regex_str3 = ".*?(b.*b).*"                      # first b taken from the front, second b from the back
regex_str4 = ".*(b.+b).*"                       # at least one character between the two b's
match_obj2 = re.match(regex_str2, line)
match_obj3 = re.match(regex_str3, line)

# re.match takes the pattern, then the text; it returns a match object on
# success and None otherwise.

你可能感兴趣的:(人生苦短,我用python)