字符串操作 · 机器学习

[TOC]

# 大小写转换

~~~
s = pd.Series(['A', 'b', 'B', 'gaer', 'AGER', np.nan])
# print(s.str.upper())
print(s.str.lower())
~~~

输出

~~~
0       a
1       b
2       b
3    gaer
4    ager
5     NaN
dtype: object
~~~

# 计算长度

~~~
s = pd.Series(['A', 'b', 'B', 'gaer', 'AGER', np.nan])
print(s.str.len())
~~~

# 去空格

~~~
index = pd.Index([' tang', ' yu ', 'di '])
print(index.str.strip())
print(index.str.lstrip())
print(index.str.rstrip())
~~~

输出

~~~
Index(['tang', 'yu', 'di'], dtype='object')
Index(['tang', 'yu ', 'di '], dtype='object')
Index([' tang', ' yu', 'di'], dtype='object')
~~~

# 修改列名

~~~
df = pd.DataFrame(np.random.randn(3, 2), columns=['A a', 'B b'], index=range(3))
print(df)
df.columns = df.columns.str.replace(' ', '_')
print(df)
~~~

输出

~~~
        A a       B b
0  0.529066  0.635890
1  1.643041 -0.540741
2 -0.066216 -0.076341
        A_a       B_b
0  0.529066  0.635890
1  1.643041 -0.540741
2 -0.066216 -0.076341
~~~

# 切割字段

~~~
s = pd.Series(['a_b_C', 'c_d_e', 'f_g_h'])
print(s)
print(s.str.split('_'))
print(s.str.split('_', expand=True))
# 只切一次
print(s.str.split('_', expand=True, n=1))
~~~

输出

~~~
0    a_b_C
1    c_d_e
2    f_g_h
dtype: object
0    [a, b, C]
1    [c, d, e]
2    [f, g, h]
dtype: object
   0  1  2
0  a  b  C
1  c  d  e
2  f  g  h
   0    1
0  a  b_C
1  c  d_e
2  f  g_h
~~~

# 是否包含某个元素

~~~
s = pd.Series(['A', 'Aas', 'Afgew', 'Ager', 'Agre', 'Agre'])
contains = s.str.contains('Aas')
print(contains)
~~~

输出

~~~
0    False
1     True
2    False
3    False
4    False
5    False
dtype: bool
~~~

# 分割统计

~~~
s = pd.Series(['a', 'a|b', 'a|c'])
print(s)
dummies = s.str.get_dummies(sep="|")
print(dummies)
~~~

输出

~~~
0      a
1    a|b
2    a|c
dtype: object
   a  b  c
0  1  0  0
1  1  1  0
2  1  0  1
~~~