dataframe,Series 字符串 替换部分字符串 ,字符串处理

 

Python
print (df) val 0 HF - Antartica 1 HF - America 2 HF - Asia print (df.val.replace({'HF -':'Hi'}, regex=True)) 0 Hi Antartica 1 Hi America 2 Hi Asia Name: val, dtype: object
1
2
3
4
5
6
7
8
9
10
11
print ( df )
               val
0    HF - Antartica
1      HF - America
2        HF - Asia
 
print ( df . val . replace ( { 'HF -' : 'Hi' } , regex = True ) )
0      Hi Antartica
1        Hi America
2          Hi Asia
Name : val , dtype : object

dataframe

Python
>df = pd.DataFrame({'Column': ['HF - Antartica', 'HF - America', 'HF - Asia']}) >df.replace('HF', 'Hi Funny', regex=True) Column 0 Hi Funny - Antartica 1 Hi Funny - America 2 Hi Funny - Asia
1
2
3
4
5
6
7
> df = pd . DataFrame ( { 'Column' : [ 'HF - Antartica' , 'HF - America' , 'HF - Asia' ] } )
> df . replace ( 'HF' , 'Hi Funny' , regex = True )
 
                 Column
0    Hi Funny - Antartica
1      Hi Funny - America
2        Hi Funny - Asia

http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.replace.html

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.replace.html

字符串函数

Python
# SELECT CONCAT(origin, ' to ', destination) df['origin'].str.cat(df['destination'], sep=' to ') df['origin'].str.strip() # TRIM(origin) df['origin'].str.len() # LENGTH(origin) df['origin'].str.replace('a', 'b') # REPLACE(origin, 'a', 'b') # SELECT SUBSTRING(origin, 1, 1) df['origin'].str[0:1] # 使用 Python 字符串索引 # SELECT SUBSTRING_INDEX(domain, '.', 2) # www.example.com -> www.example df['domain'].str.split('.').str[:2].str.join('.') df['domain'].str.extract(r'^([^.]+\.[^.]+)') s.str.split('')#.split('')切割字符串('这里写分隔符') s.str.replace('A','Z')#.replace()将字符串替换成另外的 s1 = pd.Series(['a1','a2','a3','a4'])#创建一个新的序列 s1.str.extract('([abc])(\d)')#提取 s1.str.extract('[ab](\d)')#提取 s2 = pd.Series(['a','B','c','d']) a_z = r'[a-z]'#正则表达式 s2.str.contains(a_z)#.contains()检测列表里面有没有包含指定的内容
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# SELECT CONCAT(origin, ' to ', destination)
df [ 'origin' ] . str . cat ( df [ 'destination' ] , sep = ' to ' )
 
df [ 'origin' ] . str . strip ( ) # TRIM(origin)
df [ 'origin' ] . str . len ( ) # LENGTH(origin)
df [ 'origin' ] . str . replace ( 'a' , 'b' ) # REPLACE(origin, 'a', 'b')
 
# SELECT SUBSTRING(origin, 1, 1)
df [ 'origin' ] . str [ 0 : 1 ] # 使用 Python 字符串索引
 
# SELECT SUBSTRING_INDEX(domain, '.', 2)
# www.example.com -> www.example
df [ 'domain' ] . str . split ( '.' ) . str [ : 2 ] . str . join ( '.' )
df [ 'domain' ] . str . extract ( r '^([^.]+\.[^.]+)' )
 
s . str . split ( '' ) #.split('')切割字符串('这里写分隔符')
 
s . str . replace ( 'A' , 'Z' ) #.replace()将字符串替换成另外的
s1 = pd . Series ( [ 'a1' , 'a2' , 'a3' , 'a4' ] ) #创建一个新的序列
s1 . str . extract ( '([abc])(\d)' ) #提取
s1 . str . extract ( '[ab](\d)' ) #提取
 
s2 = pd . Series ( [ 'a' , 'B' , 'c' , 'd' ] )
a_z = r '[a-z]' #正则表达式
s2 . str . contains ( a_z ) #.contains()检测列表里面有没有包含指定的内容

日期函数

pd.to_datetime 用于将各种日期字符串转换成标准的 datetime64 类型。日期类型的 Series 都会有一个 dt 属性,从中可以获取到有关日期时间的信息,具体请参考文档 Time Series / Date functionality。

Python
# SELECT STR_TO_DATE(full_date, '%Y%m%d%H%i%s') AS `datetime` df['datetime'] = pd.to_datetime(df['full_date'], format='%Y%m%d%H%M%S') # SELECT DATE_FORMAT(`datetime`, '%Y-%m-%d') df['datetime'].dt.strftime('%Y-%m-%d') df['datetime'].dt.month # MONTH(`datetime`) df['datetime'].dt.hour # HOUR(`datetime`) # SELECT UNIX_TIMESTAMP(`datetime`) df['datetime'].view('int64') // pd.Timedelta(1, unit='s').value # SELECT FROM_UNIXTIME(`timestamp`) pd.to_datetime(df['timestamp'], unit='s') # SELECT `datetime` + INTERVAL 1 DAY df['datetime'] + pd.Timedelta(1, unit='D')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# SELECT STR_TO_DATE(full_date, '%Y%m%d%H%i%s') AS `datetime`
df [ 'datetime' ] = pd . to_datetime ( df [ 'full_date' ] , format = '%Y%m%d%H%M%S' )
 
# SELECT DATE_FORMAT(`datetime`, '%Y-%m-%d')
df [ 'datetime' ] . dt . strftime ( '%Y-%m-%d' )
 
df [ 'datetime' ] . dt . month # MONTH(`datetime`)
df [ 'datetime' ] . dt . hour # HOUR(`datetime`)
 
# SELECT UNIX_TIMESTAMP(`datetime`)
df [ 'datetime' ] . view ( 'int64' ) / / pd . Timedelta ( 1 , unit = 's' ) . value
 
# SELECT FROM_UNIXTIME(`timestamp`)
pd . to_datetime ( df [ 'timestamp' ] , unit = 's' )
 
# SELECT `datetime` + INTERVAL 1 DAY
df [ 'datetime' ] + pd . Timedelta ( 1 , unit = 'D' )



你可能感兴趣的:(dataframe,Series 字符串 替换部分字符串 ,字符串处理)