1
2
3
4
5
6
7
8
9
10
11
|
print
(
df
)
val
0
HF
-
Antartica
1
HF
-
America
2
HF
-
Asia
print
(
df
.
val
.
replace
(
{
'HF -'
:
'Hi'
}
,
regex
=
True
)
)
0
Hi
Antartica
1
Hi
America
2
Hi
Asia
Name
:
val
,
dtype
:
object
|
dataframe
1
2
3
4
5
6
7
|
>
df
=
pd
.
DataFrame
(
{
'Column'
:
[
'HF - Antartica'
,
'HF - America'
,
'HF - Asia'
]
}
)
>
df
.
replace
(
'HF'
,
'Hi Funny'
,
regex
=
True
)
Column
0
Hi
Funny
-
Antartica
1
Hi
Funny
-
America
2
Hi
Funny
-
Asia
|
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.replace.html
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.replace.html
字符串函数
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
# SELECT CONCAT(origin, ' to ', destination)
df
[
'origin'
]
.
str
.
cat
(
df
[
'destination'
]
,
sep
=
' to '
)
df
[
'origin'
]
.
str
.
strip
(
)
# TRIM(origin)
df
[
'origin'
]
.
str
.
len
(
)
# LENGTH(origin)
df
[
'origin'
]
.
str
.
replace
(
'a'
,
'b'
)
# REPLACE(origin, 'a', 'b')
# SELECT SUBSTRING(origin, 1, 1)
df
[
'origin'
]
.
str
[
0
:
1
]
# 使用 Python 字符串索引
# SELECT SUBSTRING_INDEX(domain, '.', 2)
# www.example.com -> www.example
df
[
'domain'
]
.
str
.
split
(
'.'
)
.
str
[
:
2
]
.
str
.
join
(
'.'
)
df
[
'domain'
]
.
str
.
extract
(
r
'^([^.]+\.[^.]+)'
)
s
.
str
.
split
(
''
)
#.split('')切割字符串('这里写分隔符')
s
.
str
.
replace
(
'A'
,
'Z'
)
#.replace()将字符串替换成另外的
s1
=
pd
.
Series
(
[
'a1'
,
'a2'
,
'a3'
,
'a4'
]
)
#创建一个新的序列
s1
.
str
.
extract
(
'([abc])(\d)'
)
#提取
s1
.
str
.
extract
(
'[ab](\d)'
)
#提取
s2
=
pd
.
Series
(
[
'a'
,
'B'
,
'c'
,
'd'
]
)
a_z
=
r
'[a-z]'
#正则表达式
s2
.
str
.
contains
(
a_z
)
#.contains()检测列表里面有没有包含指定的内容
|
pd.to_datetime
用于将各种日期字符串转换成标准的 datetime64
类型。日期类型的 Series 都会有一个 dt
属性,从中可以获取到有关日期时间的信息,具体请参考文档 Time Series / Date functionality。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
# SELECT STR_TO_DATE(full_date, '%Y%m%d%H%i%s') AS `datetime`
df
[
'datetime'
]
=
pd
.
to_datetime
(
df
[
'full_date'
]
,
format
=
'%Y%m%d%H%M%S'
)
# SELECT DATE_FORMAT(`datetime`, '%Y-%m-%d')
df
[
'datetime'
]
.
dt
.
strftime
(
'%Y-%m-%d'
)
df
[
'datetime'
]
.
dt
.
month
# MONTH(`datetime`)
df
[
'datetime'
]
.
dt
.
hour
# HOUR(`datetime`)
# SELECT UNIX_TIMESTAMP(`datetime`)
df
[
'datetime'
]
.
view
(
'int64'
)
/
/
pd
.
Timedelta
(
1
,
unit
=
's'
)
.
value
# SELECT FROM_UNIXTIME(`timestamp`)
pd
.
to_datetime
(
df
[
'timestamp'
]
,
unit
=
's'
)
# SELECT `datetime` + INTERVAL 1 DAY
df
[
'datetime'
]
+
pd
.
Timedelta
(
1
,
unit
=
'D'
)
|