1.
In [1]: s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
In [2]: s.str.lower()
Out[2]:
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 caba
7 dog
8 cat
dtype: object
In [3]: s.str.upper()
Out[3]:
0 A
1 B
2 C
3 AABA
4 BACA
5 NaN
6 CABA
7 DOG
8 CAT
dtype: object
In [4]: s.str.len()
Out[4]:
0 1.0
1 1.0
2 1.0
3 4.0
4 4.0
5 NaN
6 4.0
7 3.0
8 3.0
dtype: float64
In [5]: idx = pd.Index([' jack', 'jill ', ' jesse ', 'frank'])
In [6]: idx.str.strip()
Out[6]: Index([u'jack', u'jill', u'jesse', u'frank'], dtype='object')
In [7]: idx.str.lstrip()
Out[7]: Index([u'jack', u'jill ', u'jesse ', u'frank'], dtype='object')
In [8]: idx.str.rstrip()
Out[8]: Index([u' jack', u'jill', u' jesse', u'frank'], dtype='object')
In [9]: df = pd.DataFrame(randn(3, 2), columns=[' Column A ', ' Column B '],
...: index=range(3))
...:
In [10]: df
Out[10]:
Column A Column B
0 0.017428 0.039049
1 -2.240248 0.847859
2 -1.342107 0.368828
In [11]: df.columns.str.strip()
Out[11]: Index([u'Column A', u'Column B'], dtype='object')
In [12]: df.columns.str.lower()
Out[12]: Index([u' column a ', u' column b '], dtype='object')
In [13]: df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
In [14]: df
Out[14]:
column_a column_b
0 0.017428 0.039049
1 -2.240248 0.847859
2 -1.342107 0.368828
2.
In [15]: s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
In [16]: s2.str.split('_')
Out[16]:
0 [a, b, c]
1 [c, d, e]
2 NaN
3 [f, g, h]
dtype: object
In [17]: s2.str.split('_').str.get(1)
Out[17]:
0 b
1 d
2 NaN
3 g
dtype: object
In [18]: s2.str.split('_').str[1]
Out[18]:
0 b
1 d
2 NaN
3 g
dtype: object
In [19]: s2.str.split('_', expand=True)
Out[19]:
0 1 2
0 a b c
1 c d e
2 NaN None None
3 f g h
In [20]: s2.str.split('_', expand=True, n=1)
Out[20]:
0 1
0 a b_c
1 c d_e
2 NaN None
3 f g_h
In [21]: s2.str.rsplit('_', expand=True, n=1)
Out[21]:
0 1
0 a_b c
1 c_d e
2 NaN None
3 f_g h
In [22]: s3 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca',
....: '', np.nan, 'CABA', 'dog', 'cat'])
....:
In [23]: s3
Out[23]:
0 A
1 B
2 C
3 Aaba
4 Baca
5
6 NaN
7 CABA
8 dog
9 cat
dtype: object
In [24]: s3.str.replace('^.a|dog', 'XX-XX ', case=False)
Out[24]:
0 A
1 B
2 C
3 XX-XX ba
4 XX-XX ca
5
6 NaN
7 XX-XX BA
8 XX-XX
9 XX-XX t
dtype: object
3.
In [29]: s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan,
....: 'CABA', 'dog', 'cat'])
....:
In [30]: s.str[0]
Out[30]:
0 A
1 B
2 C
3 A
4 B
5 NaN
6 C
7 d
8 c
dtype: object
In [31]: s.str[1]
Out[31]:
0 NaN
1 NaN
2 NaN
3 a
4 a
5 NaN
6 A
7 o
8 a
dtype: object
4.
In [32]: pd.Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)', expand=False)
Out[32]:
0 1
0 a 1
1 b 2
2 NaN NaN
In [33]: pd.Series(['a1', 'b2', 'c3']).str.extract('(?P[ab])(?P\d)', expand=False)
Out[33]:
letter digit
0 a 1
1 b 2
2 NaN NaN
In [35]: pd.Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)', expand=True)
Out[35]:
0
0 1
1 2
2 NaN
In [36]: pd.Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)', expand=False)
Out[36]:
0 1
1 2
2 NaN
dtype: object
In [37]: s = pd.Series(["a1", "b2", "c3"], ["A11", "B22", "C33"])
In [38]: s
Out[38]:
A11 a1
B22 b2
C33 c3
dtype: object
In [39]: s.index.str.extract("(?P[a-zA-Z])", expand=True)
Out[39]:
letter
0 A
1 B
2 C
In [40]: s.index.str.extract("(?P[a-zA-Z])", expand=False)
Out[40]: Index([u'A', u'B', u'C'], dtype='object', name=u'letter')
1 group >1 group
Index Index ValueError
Series Series DataFrame
5.
In [42]: s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
In [43]: s
Out[43]:
A a1a2
B b1
C c1
dtype: object
In [44]: two_groups = '(?P[a-z])(?P[0-9])'
In [45]: s.str.extract(two_groups, expand=True)
Out[45]:
letter digit
A a 1
B b 1
C c 1
In [46]: s.str.extractall(two_groups)
Out[46]:
letter digit
match
A 0 a 1
1 a 2
B 0 b 1
C 0 c 1
6.
In [56]: pattern = r'[a-z][0-9]'
In [57]: pd.Series(['1', '2', '3a', '3b', '03c']).str.contains(pattern)
Out[57]:
0 False
1 False
2 False
3 False
4 False
dtype: bool
7.
In [59]: s4 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
In [60]: s4.str.contains('A', na=False)
Out[60]:
0 True
1 False
2 False
3 True
4 False
5 False
6 True
7 False
8 False
dtype: bool
8.
In [61]: s = pd.Series(['a', 'a|b', np.nan, 'a|c'])
In [62]: s.str.get_dummies(sep='|')
Out[62]:
a b c
0 1 0 0
1 1 1 0
2 0 0 0
3 1 0 1
In [63]: idx = pd.Index(['a', 'a|b', np.nan, 'a|c'])
In [64]: idx.str.get_dummies(sep='|')
Out[64]:
MultiIndex(levels=[[0, 1], [0, 1], [0, 1]],
labels=[[1, 1, 0, 1], [0, 1, 0, 0], [0, 0, 0, 1]],
names=[u'a', u'b', u'c'])