第08章数据清理 · Pandas Cookbook 带注释源码

# 第08章数据清理 ```py In[1]: import pandas as pd import numpy as np ``` ## 1\. 用stack清理变量值作为列名 ```py # 加载state_fruit数据集 In[2]: state_fruit = pd.read_csv('data/state_fruit.csv', index_col=0) state_fruit out[2]: ``` ![](https://img.kancloud.cn/b2/e1/b2e1172e9860ca68f79e5dfc7b2fd75f_327x163.png) ```py # stack方法可以将所有列名，转变为垂直的一级行索引 In[3]: state_fruit.stack() out[3]: Texas Apple 12 Orange 10 Banana 40 Arizona Apple 9 Orange 7 Banana 12 Florida Apple 0 Orange 14 Banana 190 dtype: int64 ``` ```py # 使用reset_index()，将结果变为DataFrame In[4]: state_fruit_tidy = state_fruit.stack().reset_index() state_fruit_tidy out[4]: ``` ![](https://img.kancloud.cn/16/54/16549e9d1252d9b9f3c7aefe3781748b_247x397.png) ```py # 重命名列名 In[5]: state_fruit_tidy.columns = ['state', 'fruit', 'weight'] state_fruit_tidy out[5]: ``` ![](https://img.kancloud.cn/80/df/80df3542e80fb4e9551da5084f6f8f1f_266x393.png) ```py # 也可以使用rename_axis给不同的行索引层级命名 In[6]: state_fruit.stack()\ .rename_axis(['state', 'fruit'])\ out[6]: state fruit Texas Apple 12 Orange 10 Banana 40 Arizona Apple 9 Orange 7 Banana 12 Florida Apple 0 Orange 14 Banana 190 dtype: int64 ``` ```py # 再次使用reset_index方法 In[7]: state_fruit.stack()\ .rename_axis(['state', 'fruit'])\ .reset_index(name='weight') out[7]: ``` ![](https://img.kancloud.cn/de/a0/dea0316ca1762db391a93782793df67f_276x405.png) ### 更多 ```py # 读取state_fruit2数据集 In[8]: state_fruit2 = pd.read_csv('data/state_fruit2.csv') state_fruit2 out[8]: ``` ![](https://img.kancloud.cn/18/7c/187c6c409c0585829f61bda7590d3bfd_1108x528.png) ```py # 州名不在行索引的位置上，使用stack将所有列名变为一个长Series In[9]: state_fruit2.stack() out[9]: 0 State Texas Apple 12 Orange 10 Banana 40 1 State Arizona Apple 9 Orange 7 Banana 12 2 State Florida Apple 0 Orange 14 Banana 190 dtype: object ``` ```py # 先设定state作为行索引名，再stack，可以得到和前面相似的结果 In[10]: state_fruit2.set_index('State').stack() out[10]: 0 State Texas Apple 12 Orange 10 Banana 40 1 State Arizona Apple 9 Orange 7 Banana 12 2 State Florida Apple 0 Orange 14 Banana 190 dtype: object ``` ## 2\. 用melt清理变量值作为列名 ```py # 读取state_fruit2数据集 In[11]: state_fruit2 = pd.read_csv('data/state_fruit2.csv') state_fruit2 out[11]: ``` ![](https://img.kancloud.cn/3b/7c/3b7cdff8bbdf806ae06fbcc7c45ae1b2_550x268.png) ```py # 使用melt方法，将列传给id_vars和value_vars。melt可以将原先的列名作为变量，原先的值作为值。 In[12]: state_fruit2.melt(id_vars=['State'], value_vars=['Apple', 'Orange', 'Banana']) out[12]: ``` ![](https://img.kancloud.cn/b5/65/b56584ace13aa251d267f9bfadf40a49_836x1304.png) ```py # 随意设定一个行索引 In[13]: state_fruit2.index=list('abc') state_fruit2.index.name = 'letter' In[14]: state_fruit2 out[14]: ``` ![](https://img.kancloud.cn/da/ec/daecbb0a3b90e3b5cda264f509e3aeff_1208x668.png) ```py # var_name和value_name可以用来重命名新生成的变量列和值的列 In[15]: state_fruit2.melt(id_vars=['State'], value_vars=['Apple', 'Orange', 'Banana'], var_name='Fruit', value_name='Weight') out[15]: ``` ![](https://img.kancloud.cn/bc/20/bc20c0625dfeb9c01e923d66a186404d_860x1340.png) ```py # 如果你想让所有值都位于一列，旧的列标签位于另一列，可以直接使用melt In[16]: state_fruit2.melt() out[16]: ``` ![](https://img.kancloud.cn/bb/54/bb54a3668ad9f1425117e6bba21d7eed_334x830.png) ```py # 要指明id变量，只需使用id_vars参数 In[17]: state_fruit2.melt(id_vars='State') out[17]: ``` ![](https://img.kancloud.cn/7e/96/7e96d23321a827975077a4d5d43e7f0c_420x650.png) ## 3\. 同时stack多组变量 ```py # 读取movie数据集，选取所有演员名和其Facebook likes In[18]: movie = pd.read_csv('data/movie.csv') actor = movie[['movie_title', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'actor_1_facebook_likes', 'actor_2_facebook_likes', 'actor_3_facebook_likes']] actor.head() out[18]: ``` ![](https://img.kancloud.cn/d8/75/d8753737344cf61c6c2d6e60776624c8_1586x357.png) ```py # 创建一个自定义函数，用来改变列名。wide_to_long要求分组的变量要有相同的数字结尾： In[19]: def change_col_name(col_name): col_name = col_name.replace('_name', '') if 'facebook' in col_name: fb_idx = col_name.find('facebook') col_name = col_name[:5] + col_name[fb_idx - 1:] + col_name[5:fb_idx-1] return col_name In[20]: actor2 = actor.rename(columns=change_col_name) actor2.head() out[20]: ``` ![](https://img.kancloud.cn/d5/40/d540b4b347ee4b572b0600ceefa0bf28_1188x411.png) ```py # 使用wide_to_long函数，同时stack两列actor和Facebook In[21]: stubs = ['actor', 'actor_facebook_likes'] actor2_tidy = pd.wide_to_long(actor2, stubnames=stubs, i=['movie_title'], j='actor_num', sep='_').reset_index() actor2_tidy.head() out[21]: ``` ![](https://img.kancloud.cn/5c/e8/5ce8918099abe7dc2efa38477ee87cd2_870x247.png) ### 更多 ```py # 加载数据 In[22]: df = pd.read_csv('data/stackme.csv') df out[22]: ``` ![](https://img.kancloud.cn/2a/6b/2a6baf3f9fb011e2eaf5726319fe6b4b_402x168.png) ```py # 对列重命名 In[23]: df2 = df.rename(columns = {'a1':'group1_a1', 'b2':'group1_b2', 'd':'group2_a1', 'e':'group2_b2'}) df2 out[23]: ``` ![](https://img.kancloud.cn/8e/b8/8eb8fb424c6fd65d3d748681f57c7e5f_688x167.png) ```py # 设定stubnames=['group1', 'group2']，对任何数字都起作用 In[24]: pd.wide_to_long(df2, stubnames=['group1', 'group2'], i=['State', 'Country', 'Test'], j='Label', suffix='.+', sep='_') out[24]: ``` ![](https://img.kancloud.cn/93/05/93057e59c20a30a29800c8a24fe98f29_442x320.png) ## 4\. 反转stacked数据 ```py # 读取college数据集，学校名作为行索引，，只选取本科生的列 In[25]: usecol_func = lambda x: 'UGDS_' in x or x == 'INSTNM' college = pd.read_csv('data/college.csv', index_col='INSTNM', usecols=usecol_func) college.head() out[25]: ``` ![](https://img.kancloud.cn/70/26/7026c6a9aa9c55257ac96eba5892f335_1202x547.png) ```py # 用stack方法，将所有水平列名，转化为垂直的行索引 In[26]: college_stacked = college.stack() college_stacked.head(18) out[26]: INSTNM Alabama A & M University UGDS_WHITE 0.0333 UGDS_BLACK 0.9353 UGDS_HISP 0.0055 UGDS_ASIAN 0.0019 UGDS_AIAN 0.0024 UGDS_NHPI 0.0019 UGDS_2MOR 0.0000 UGDS_NRA 0.0059 UGDS_UNKN 0.0138 University of Alabama at Birmingham UGDS_WHITE 0.5922 UGDS_BLACK 0.2600 UGDS_HISP 0.0283 UGDS_ASIAN 0.0518 UGDS_AIAN 0.0022 UGDS_NHPI 0.0007 UGDS_2MOR 0.0368 UGDS_NRA 0.0179 UGDS_UNKN 0.0100 dtype: float64 ``` ```py # unstack方法可以将其还原 In[27]: college_stacked.unstack().head() out[27]: ``` ![](https://img.kancloud.cn/cf/f6/cff61d06a58c6a64ac7e16038f7261e1_1218x494.png) ```py # 另一种方式是先用melt，再用pivot。先加载数据，不指定行索引名 In[28]: college2 = pd.read_csv('data/college.csv', usecols=usecol_func) college2.head() out[28]: ``` ![](https://img.kancloud.cn/75/ec/75ec5772d0b90f60c170618b93d97faa_1200x510.png) ```py # 使用melt，将所有race列变为一列 In[29]: college_melted = college2.melt(id_vars='INSTNM', var_name='Race', value_name='Percentage') college_melted.head() out[29]: ``` ![](https://img.kancloud.cn/32/9a/329ad7436c6dad3c2d85ec318b023b1d_603x243.png) ```py # 用pivot还原 In[30]: melted_inv = college_melted.pivot(index='INSTNM', columns='Race', values='Percentage') melted_inv.head() out[30]: ``` ![](https://img.kancloud.cn/07/00/0700ffa35aa3ecfc7ca676c561ff63d1_1201x546.png) ```py # 用loc同时选取行和列，然后重置索引，可以获得和原先索引顺序一样的DataFrame In[31]: college2_replication = melted_inv.loc[college2['INSTNM'], college2.columns[1:]]\ .reset_index() college2.equals(college2_replication) out[31]: True ``` ### 更多 ```py # 使用最外层的行索引做unstack In[32]: college.stack().unstack(0) out[32]: ``` ![](https://img.kancloud.cn/a2/82/a2827b19a0e56c4a671a54f5f65a2f5d_1210x592.png) ```py # 转置DataFrame更简单的方法是transpose()或T In[33]: college.T out[33]: ``` ![](https://img.kancloud.cn/64/ee/64ee141cc98ec641bcb7731157a37cb4_1221x494.png) ## 5\. 分组聚合后unstacking ```py # 读取employee数据集，求出每个种族的平均工资 In[34]: employee = pd.read_csv('data/employee.csv') In[35]: employee.groupby('RACE')['BASE_SALARY'].mean().astype(int) out[35]: RACE American Indian or Alaskan Native 60272 Asian/Pacific Islander 61660 Black or African American 50137 Hispanic/Latino 52345 Others 51278 White 64419 Name: BASE_SALARY, dtype: int64 ``` ```py # 对种族和性别分组，求平均工资 In[36]: agg = employee.groupby(['RACE', 'GENDER'])['BASE_SALARY'].mean().astype(int) agg out[36]: RACE GENDER American Indian or Alaskan Native Female 60238 Male 60305 Asian/Pacific Islander Female 63226 Male 61033 Black or African American Female 48915 Male 51082 Hispanic/Latino Female 46503 Male 54782 Others Female 63785 Male 38771 White Female 66793 Male 63940 Name: BASE_SALARY, dtype: int64 ``` ```py # 对索引层GENDER做unstack In[37]: agg.unstack('GENDER') out[37]: ``` ![](https://img.kancloud.cn/58/f7/58f7eb30118a4fd406af74f6242d7480_475x315.png) ```py # 对索引层RACE做unstack In[38]: agg.unstack('RACE') out[38]: ``` ![](https://img.kancloud.cn/6f/67/6f675598baa12e1ed9086fc51dbfe58b_1162x165.png) ### 更多 ```py # 按RACE和GENDER分组，求工资的平均值、最大值和最小值 In[39]: agg2 = employee.groupby(['RACE', 'GENDER'])['BASE_SALARY'].agg(['mean', 'max', 'min']).astype(int) agg2 out[39]: ``` ![](https://img.kancloud.cn/cd/02/cd02acb31c6b509c778bc9119caef4a8_632x552.png) ```py # 此时unstack('GENDER')会生成多级列索引，可以用stack和unstack调整结构 agg2.unstack('GENDER') ``` ![](https://img.kancloud.cn/c3/83/c383d220457ad5dd6745e5c38ba47058_1070x504.png) ## 6\. 用分组聚合实现透视表 ```py # 读取flights数据集 In[40]: flights = pd.read_csv('data/flights.csv') flights.head() out[40]: ``` ![](https://img.kancloud.cn/8b/75/8b75b2ceda9b79adc8fb084f95f1669d_1202x279.png) ```py # 用pivot_table方法求出每条航线每个始发地的被取消的航班总数 In[41]: fp = flights.pivot_table(index='AIRLINE', columns='ORG_AIR', values='CANCELLED', aggfunc='sum', fill_value=0).round(2) fp.head() out[41]: ``` ![](https://img.kancloud.cn/00/f7/00f76cd0c2e0c14c8e86ad1a309a5da8_657x284.png) ```py # groupby聚合不能直接复现这张表。需要先按所有index和columns的列聚合 In[42]: fg = flights.groupby(['AIRLINE', 'ORG_AIR'])['CANCELLED'].sum() fg.head() out[42]: AIRLINE ORG_AIR AA ATL 3 DEN 4 DFW 86 IAH 3 LAS 3 Name: CANCELLED, dtype: int64 ``` ```py # 再使用unstack，将ORG_AIR这层索引作为列名 In[43]: fg_unstack = fg.unstack('ORG_AIR', fill_value=0) fg_unstack.head() out[43]: ``` ![](https://img.kancloud.cn/2c/cb/2ccbe2b1ecf50a40e5f748310d00fa3c_665x282.png) ```py # 判断两个方式是否等价 In[44]: fg_unstack = fg.unstack('ORG_AIR', fill_value=0) fp.equals(fg_unstack) out[44]: True ``` ### 更多 ```py # 先实现一个稍微复杂的透视表 In[45]: fp2 = flights.pivot_table(index=['AIRLINE', 'MONTH'], columns=['ORG_AIR', 'CANCELLED'], values=['DEP_DELAY', 'DIST'], aggfunc=[np.mean, np.sum], fill_value=0) fp2.head() out[45]: ``` ![](https://img.kancloud.cn/a7/f9/a7f99136e4ead5fcd59c26db1e87e144_1206x484.png) ```py # 用groupby和unstack复现上面的方法 In[46]: flights.groupby(['AIRLINE', 'MONTH', 'ORG_AIR', 'CANCELLED'])['DEP_DELAY', 'DIST'] \ .agg(['mean', 'sum']) \ .unstack(['ORG_AIR', 'CANCELLED'], fill_value=0) \ .swaplevel(0, 1, axis='columns') \ .head() out[46]: ``` ![](https://img.kancloud.cn/c1/15/c1159449d2aa7dac5ffa2a94d8ce7249_1206x476.png) ## 7\. 为了更容易reshaping，重新命名索引层 ```py # 读取college数据集，分组后，统计本科生的SAT数学成绩信息 In[47]: college = pd.read_csv('data/college.csv') In[48]: cg = college.groupby(['STABBR', 'RELAFFIL'])['UGDS', 'SATMTMID'] \ .agg(['count', 'min', 'max']).head(6) In[49]: cg out[49]: ``` ![](https://img.kancloud.cn/51/fc/51fc2948887d0d7ba9a3c4811918c85e_612x352.png) ```py # 行索引的两级都有名字，而列索引没有名字。用rename_axis给列索引的两级命名 In[50]:cg = cg.rename_axis(['AGG_COLS', 'AGG_FUNCS'], axis='columns') cg out[50]: ``` ![](https://img.kancloud.cn/c5/79/c579c7b000cd284c53bff5524159d135_638x359.png) ```py # 将AGG_FUNCS列移到行索引 In[51]:cg.stack('AGG_FUNCS').head() out[51]: ``` ![](https://img.kancloud.cn/cc/70/cc7078fcb6f70731fd86a1ccbd319cf4_530x279.png) ```py # stack默认是将列放到行索引的最内层，可以使用swaplevel改变层级 In[52]:cg.stack('AGG_FUNCS').swaplevel('AGG_FUNCS', 'STABBR', axis='index').head() out[52]: ``` ![](https://img.kancloud.cn/84/65/846587b93f07fc8b978501c4bdc21e35_556x282.png) ```py # 在此前的基础上再做sort_index In[53]:cg.stack('AGG_FUNCS') \ .swaplevel('AGG_FUNCS', 'STABBR', axis='index') \ .sort_index(level='RELAFFIL', axis='index') \ .sort_index(level='AGG_COLS', axis='columns').head(6) out[53]: ``` ![](https://img.kancloud.cn/ae/15/ae15dba31b740aea5ced72c2d10079bf_544x320.png) ```py # 对一些列做stack，对其它列做unstack In[54]:cg.stack('AGG_FUNCS').unstack(['RELAFFIL', 'STABBR']) out[54]: ``` ![](https://img.kancloud.cn/db/93/db93c936c5181df83dfb713fad141142_965x287.png) ```py # 对所有列做stack，会返回一个Series In[55]:cg.stack(['AGG_FUNCS', 'AGG_COLS']).head(12) out[55]: ``` ![](https://img.kancloud.cn/83/1a/831a3f05d0d1760c09785a1f82aa0ce7_546x362.png) ### 更多 ```py # 删除行和列索引所有层级的名称 In[56]:cg.rename_axis([None, None], axis='index').rename_axis([None, None], axis='columns') out[56]: ``` ![](https://img.kancloud.cn/93/1a/931a684e46b4a1272c8abcc56c99fdc2_483x316.png) ## 8\. 当多个变量被存储为列名时进行清理 ```py # 读取weightlifting数据集 In[57]:weightlifting = pd.read_csv('data/weightlifting_men.csv') weightlifting out[57]: ``` ![](https://img.kancloud.cn/7d/eb/7debef38679f340c6ca675c73bcec9a0_1198x383.png) ```py # 用melt方法，将sex_age放入一个单独的列 In[58]:wl_melt = weightlifting.melt(id_vars='Weight Category', var_name='sex_age', value_name='Qual Total') wl_melt.head() out[58]: ``` ![](https://img.kancloud.cn/a4/4a/a44a8a2e0d9d9629e1091cc42f8e7521_407x241.png) ```py # 用split方法将sex_age列分为两列 In[59]:sex_age = wl_melt['sex_age'].str.split(expand=True) sex_age.head() out[59]: 0 1 0 M35 35-39 1 M35 35-39 2 M35 35-39 3 M35 35-39 4 M35 35-39 ``` ```py # 给列起名 In[60]:sex_age.columns = ['Sex', 'Age Group'] sex_age.head() out[60]: ``` ![](https://img.kancloud.cn/bb/24/bb2479b2a448bb9ed002af2b60c9060b_209x236.png) ```py # 只取出字符串中的M In[61]:sex_age['Sex'] = sex_age['Sex'].str[0] sex_age.head() out[61]: ``` ![](https://img.kancloud.cn/0e/4d/0e4daf72e1a81703cbfb2efec389f8ce_196x240.png) ```py # 用concat方法，将sex_age,与wl_cat_total连接起来 In[62]:wl_cat_total = wl_melt[['Weight Category', 'Qual Total']] wl_tidy = pd.concat([sex_age, wl_cat_total], axis='columns') wl_tidy.head() out[62]: ``` ![](https://img.kancloud.cn/88/cd/88cdfc065d3f7b5807343136601cd15a_464x236.png) ```py # 上面的结果也可以如下实现 In[63]:cols = ['Weight Category', 'Qual Total'] sex_age[cols] = wl_melt[cols] ``` ### 更多 ```py # 也可以通过assign的方法，动态加载新的列 In[64]: age_group = wl_melt.sex_age.str.extract('(\d{2}[-+](?:\d{2})?)', expand=False) sex = wl_melt.sex_age.str[0] new_cols = {'Sex':sex, 'Age Group': age_group} In[65]: wl_tidy2 = wl_melt.assign(**new_cols).drop('sex_age', axis='columns') wl_tidy2.head() out[65]: ``` ![](https://img.kancloud.cn/e5/4e/e54edaddd485caa9afb30d6e83639e70_465x242.png) ```py # 判断两种方法是否等效 In[66]: wl_tidy2.sort_index(axis=1).equals(wl_tidy.sort_index(axis=1)) out[66]: True ``` ## 9\. 当多个变量被存储为列的值时进行清理 ```py # 读取restaurant_inspections数据集，将Date列的数据类型变为datetime64 In[67]: inspections = pd.read_csv('data/restaurant_inspections.csv', parse_dates=['Date']) inspections.head(10) out[67]: ``` ![](https://img.kancloud.cn/30/0e/300e7e9b0a18e3aad8464b0522b842c7_827x433.png) ```py # 用info列的所有值造一个新列。但是，Pandas不支持这种功能 In[68]: inspections.pivot(index=['Name', 'Date'], columns='Info', values='Value') --------------------------------------------------------------------------- ValueError Traceback (most recent call last) /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/categorical.py in __init__(self, values, categories, ordered, fastpath) 297 try: --> 298 codes, categories = factorize(values, sort=True) 299 except TypeError: /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/algorithms.py in factorize(values, sort, order, na_sentinel, size_hint) 559 check_nulls = not is_integer_dtype(original) --> 560 labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) 561 pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_labels (pandas/_libs/hashtable.c:21922)() ValueError: Buffer has wrong number of dimensions (expected 1, got 2) During handling of the above exception, another exception occurred: NotImplementedError Traceback (most recent call last) <ipython-input-68-754f69d68d6c> in <module>() ----> 1 inspections.pivot(index=['Name', 'Date'], columns='Info', values='Value') /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in pivot(self, index, columns, values) 3851 """ 3852 from pandas.core.reshape.reshape import pivot -> 3853 return pivot(self, index=index, columns=columns, values=values) 3854 3855 def stack(self, level=-1, dropna=True): /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/reshape/reshape.py in pivot(self, index, columns, values) 375 index = self[index] 376 indexed = Series(self[values].values, --> 377 index=MultiIndex.from_arrays([index, self[columns]])) 378 return indexed.unstack(columns) 379 /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/multi.py in from_arrays(cls, arrays, sortorder, names) 1098 from pandas.core.categorical import _factorize_from_iterables 1099 -> 1100 labels, levels = _factorize_from_iterables(arrays) 1101 if names is None: 1102 names = [getattr(arr, "name", None) for arr in arrays] /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/categorical.py in _factorize_from_iterables(iterables) 2191 # For consistency, it should return a list of 2 lists. 2192 return [[], []] -> 2193 return map(list, lzip(*[_factorize_from_iterable(it) for it in iterables])) /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/categorical.py in <listcomp>(.0) 2191 # For consistency, it should return a list of 2 lists. 2192 return [[], []] -> 2193 return map(list, lzip(*[_factorize_from_iterable(it) for it in iterables])) /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/categorical.py in _factorize_from_iterable(values) 2163 codes = values.codes 2164 else: -> 2165 cat = Categorical(values, ordered=True) 2166 categories = cat.categories 2167 codes = cat.codes /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/categorical.py in __init__(self, values, categories, ordered, fastpath) 308 309 # FIXME --> 310 raise NotImplementedError("> 1 ndim Categorical are not " 311 "supported at this time") 312 NotImplementedError: > 1 ndim Categorical are not supported at this time ``` ```py # 将'Name','Date', 'Info'作为所索引 In[69]: inspections.set_index(['Name','Date', 'Info']).head(10) out[69]: ``` ![](https://img.kancloud.cn/53/10/5310c130f42d6ff97d082dc7fe5b6d99_820x472.png) ```py # 用pivot，将info列中的值变为新的列 In[70]: inspections.set_index(['Name','Date', 'Info']).unstack('Info').head() out[70]: ``` ![](https://img.kancloud.cn/a4/16/a41616db7de3dddcda89bd9a70e56e4d_1184x421.png) ```py # 用reset_index方法，使行索引层级与列索引相同 In[71]: insp_tidy = inspections.set_index(['Name','Date', 'Info']) \ .unstack('Info') \ .reset_index(col_level=-1) insp_tidy.head() out[71]: ``` ![](https://img.kancloud.cn/0a/42/0a4287345af6e77cc8566fa2b21f4803_1189x389.png) ```py # 除掉列索引的最外层，重命名行索引的层为None In[72]: insp_tidy.columns = insp_tidy.columns.droplevel(0).rename(None) insp_tidy.head() out[72]: ``` ![](https://img.kancloud.cn/5b/75/5b75e3007553876056e3a0fc39ef168f_1194x360.png) ```py # 使用squeeze方法，可以避免前面的多级索引 In[73]: inspections.set_index(['Name','Date', 'Info']) \ .squeeze() \ .unstack('Info') \ .reset_index() \ .rename_axis(None, axis='columns') out[73]: ``` ![](https://img.kancloud.cn/e0/cb/e0cbc564bf493a8a44ee8230526f2880_1166x496.png) ### 更多 ```py # pivot_table需要传入聚合函数，才能产生一个单一值 In[74]: inspections.pivot_table(index=['Name', 'Date'], columns='Info', values='Value', aggfunc='first') \ .reset_index()\ .rename_axis(None, axis='columns') out[74]: ``` ![](https://img.kancloud.cn/84/34/84341bb1b09aad6158db297411bab954_1216x507.png) ## 10\. 当两个或多个值存储于一个单元格时进行清理 ```py # 读取texas_cities数据集 In[75]: cities = pd.read_csv('data/texas_cities.csv') cities out[75]: ``` ![](https://img.kancloud.cn/cd/b1/cdb170f26b1ea67a561d2d861b46c06f_336x165.png) ```py # 将Geolocation分解为四个单独的列 In[76]: geolocations = cities.Geolocation.str.split(pat='. ', expand=True) geolocations.columns = ['latitude', 'latitude direction', 'longitude', 'longitude direction'] geolocations out[76]: ``` ![](https://img.kancloud.cn/93/33/9333b05b8f8afc9b43b9ed7d2f79a659_561x162.png) ```py # 转变数据类型 In[77]: geolocations = geolocations.astype({'latitude':'float', 'longitude':'float'}) geolocations.dtypes out[77]: latitude float64 latitude direction object longitude float64 longitude direction object dtype: object ``` ```py # 将新列与原先的city列连起来 In[78]: cities_tidy = pd.concat([cities['City'], geolocations], axis='columns') cities_tidy out[78]: ``` ![](https://img.kancloud.cn/ff/81/ff810806994236d7ed893a93b0d736f9_649x163.png) ```py # 忽略，作者这里是写重复了 In[79]: pd.concat([cities['City'], geolocations], axis='columns') out[79]: ``` ![](https://img.kancloud.cn/cd/95/cd950b1609e853db6c569f19fdc9ddfa_648x166.png) ### 原理 ```py # 函数to_numeric可以将每列自动变为整数或浮点数 In[80]: temp = geolocations.apply(pd.to_numeric, errors='ignore') temp out[80]: ``` ![](https://img.kancloud.cn/02/c5/02c527f081e48f4ea1bb484745e37447_562x160.png) ```py # 再查看数据类型 In[81]: temp.dtypes out[81]: latitude float64 latitude direction object longitude float64 longitude direction object dtype: object ``` ### 更多 ```py # |符，可以对多个标记进行分割 In[82]: cities.Geolocation.str.split(pat='° |, ', expand=True) out[82]: ``` ![](https://img.kancloud.cn/4e/16/4e162bb550a9a156f8a4bd1f91732fe3_267x162.png) ```py # 更复杂的提取方式 In[83]: cities.Geolocation.str.extract('([0-9.]+). (N|S), ([0-9.]+). (E|W)', expand=True) out[83]: ``` ![](https://img.kancloud.cn/b6/0f/b60fb02444be4ec07423e617d0232e14_264x157.png) ## 11\. 当多个变量被存储为列名和列值时进行清理 ```py # 读取sensors数据集 In[84]: sensors = pd.read_csv('data/sensors.csv') sensors out[84]: ``` ![](https://img.kancloud.cn/73/12/7312571f8db7fd3abb3145e0b226670a_514x274.png) ```py # 用melt清理数据 In[85]: sensors.melt(id_vars=['Group', 'Property'], var_name='Year').head(6) out[85]: ``` ![](https://img.kancloud.cn/52/6e/526e4692fab321b65b6eb58276bd8fce_350x274.png) ```py # 用pivot_table，将Property列转化为新的列名 In[86]: sensors.melt(id_vars=['Group', 'Property'], var_name='Year') \ .pivot_table(index=['Group', 'Year'], columns='Property', values='value') \ .reset_index() \ .rename_axis(None, axis='columns') out[86]: ``` ![](https://img.kancloud.cn/ac/a8/aca856ec1881a598c764cdb08fcb7824_451x435.png) ### 更多 ```py # 用stack和unstack实现上述方法 In[87]: sensors.set_index(['Group', 'Property']) \ .stack() \ .unstack('Property') \ .rename_axis(['Group', 'Year'], axis='index') \ .rename_axis(None, axis='columns') \ .reset_index() out[87]: ``` ![](https://img.kancloud.cn/89/a5/89a5591ac0658e06562232e77d4d64a9_450x431.png) ## 12\. 当多个观察单位被存储于同一张表时进行清理 ```py # 读取movie_altered数据集 In[88]: movie = pd.read_csv('data/movie_altered.csv') movie.head() out[88]: ``` ![](https://img.kancloud.cn/23/f9/23f9c92c1ca06fa8ca34f73006e04457_1200x547.png) ```py # 插入新的列，用来标识每一部电影 In[89]: movie.insert(0, 'id', np.arange(len(movie))) movie.head() out[89]: ``` ![](https://img.kancloud.cn/5d/f3/5df395019a47eafdd967d5147c6e9c37_1206x549.png) ```py # 用wide_to_long，将所有演员放到一列，将所有Facebook likes放到一列 In[90]: stubnames = ['director', 'director_fb_likes', 'actor', 'actor_fb_likes'] movie_long = pd.wide_to_long(movie, stubnames=stubnames, i='id', j='num', sep='_').reset_index() movie_long['num'] = movie_long['num'].astype(int) movie_long.head(9) out[90]: ``` ![](https://img.kancloud.cn/12/61/126180161126234f8e7fd0d13db2dab5_1191x543.png) ```py # 将这个数据分解成多个小表 In[91]: movie_table = movie_long[['id','title', 'year', 'duration', 'rating']] director_table = movie_long[['id', 'director', 'num', 'director_fb_likes']] actor_table = movie_long[['id', 'actor', 'num', 'actor_fb_likes']] In[92]: movie_table.head(9) out[90]: ``` ![](https://img.kancloud.cn/02/78/02780466ea1557d343d4ff3a1e74fc3c_649x399.png) ```py In[93]: director_table.head(9) out[93]: ``` ![](https://img.kancloud.cn/00/b7/00b702ff3ac51f89c1bd87b268e6bb13_443x400.png) ```py In[94]: actor_table.head(9) out[94]: ``` ![](https://img.kancloud.cn/82/39/823934c0d3ccf42f63a0790e4a72f433_431x395.png) ```py # 做一些去重和去除缺失值的工作 In[95]: movie_table = movie_table.drop_duplicates().reset_index(drop=True) director_table = director_table.dropna().reset_index(drop=True) actor_table = actor_table.dropna().reset_index(drop=True) In[96]: movie_table.head() out[96]: ``` ![](https://img.kancloud.cn/f0/b9/f0b928e497789ae36b63fece217b7283_681x238.png) ```py In[97]: director_table.head() out[97]: ``` ![](https://img.kancloud.cn/a1/c9/a1c946e15e0896ffef7c8b836ad6d029_452x248.png) ```py # 比较内存的使用量 In[98]: movie.memory_usage(deep=True).sum() out[98]: 2318234 In[99]: movie_table.memory_usage(deep=True).sum() + \ director_table.memory_usage(deep=True).sum() + \ actor_table.memory_usage(deep=True).sum() out[99]: 2624898 ``` ```py # 创建演员和导演的id列 In[100]: director_cat = pd.Categorical(director_table['director']) director_table.insert(1, 'director_id', director_cat.codes) actor_cat = pd.Categorical(actor_table['actor']) actor_table.insert(1, 'actor_id', actor_cat.codes) director_table.head() out[100]: ``` ![](https://img.kancloud.cn/f1/50/f15043a522a910a3a414716575b49ce4_564x242.png) ```py In[101]: actor_table.head() out[101]: ``` ![](https://img.kancloud.cn/55/64/5564b64565a22e8912e25133cc6e59b2_519x245.png) ```py # 可以用这两张表生成要用的中间表。先来做director表 In[102]: director_associative = director_table[['id', 'director_id', 'num']] dcols = ['director_id', 'director', 'director_fb_likes'] director_unique = director_table[dcols].drop_duplicates().reset_index(drop=True) director_associative.head() out[102]: ``` ![](https://img.kancloud.cn/b9/c8/b9c8b02f92d9a75dda2a11d3a296fed1_244x235.png) ```py In[103]: director_unique.head() out[103]: ``` ![](https://img.kancloud.cn/4d/d3/4dd311b1a93c499df038b8891644db2c_474x243.png) ```py # 再来做actor表 In[104]: actor_associative = actor_table[['id', 'actor_id', 'num']] acols = ['actor_id', 'actor', 'actor_fb_likes'] actor_unique = actor_table[acols].drop_duplicates().reset_index(drop=True) actor_associative.head() out[104]: ``` ![](https://img.kancloud.cn/2d/21/2d213b7a380b505abb93afd90120901b_218x237.png) ```py In[105]: actor_unique.head() out[105]: ``` ![](https://img.kancloud.cn/75/22/75228ba919e14ad9ae0c6bd2ea6ea3da_426x242.png) ```py # 查看新的表所使用的内存量 In[106]: movie_table.memory_usage(deep=True).sum() + \ director_associative.memory_usage(deep=True).sum() + \ director_unique.memory_usage(deep=True).sum() + \ actor_associative.memory_usage(deep=True).sum() + \ actor_unique.memory_usage(deep=True).sum() out[106]: 1833402 ``` ```py In[107]: movie_table.head() out[107]: ``` ![](https://img.kancloud.cn/32/45/3245a155b95dd3c5b779b20d45597462_683x242.png) ```py # 可以通过将左右表组合起来形成movie表。首先将附表与actor/director表结合，然后将num列pivot，再加上列的前缀 In[108]: actors = actor_associative.merge(actor_unique, on='actor_id') \ .drop('actor_id', 1) \ .pivot_table(index='id', columns='num', aggfunc='first') actors.columns = actors.columns.get_level_values(0) + '_' + \ actors.columns.get_level_values(1).astype(str) directors = director_associative.merge(director_unique, on='director_id') \ .drop('director_id', 1) \ .pivot_table(index='id', columns='num', aggfunc='first') directors.columns = directors.columns.get_level_values(0) + '_' + \ directors.columns.get_level_values(1).astype(str) In[109]: actors.head() out[109]: ``` ![](https://img.kancloud.cn/ba/69/ba6930a282a0a4fa2bb43f776234b39a_1015x288.png) ```py In[110]: directors.head() out[110]: ``` ![](https://img.kancloud.cn/d8/37/d8377d3c8789ca1e48def2dca4fe3c0d_387x279.png) ```py In[111]: movie2 = movie_table.merge(directors.reset_index(), on='id', how='left') \ .merge(actors.reset_index(), on='id', how='left') In[112]: movie2.head() out[112]: ``` ![](https://img.kancloud.cn/77/48/7748e7846f8165122171f5e394810414_1210x552.png) ```py In[113]: movie.equals(movie2[movie.columns]) out[113]: True ```