groupby · 机器学习

[TOC] # 准备数据 ~~~ import pandas as pd frame = pd.DataFrame( {'key': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C', ], 'data': [0, 5, 10, 5, 10, 15, 10, 15, 20]}) print(frame) ~~~ 输出 ~~~ data key 0 0 A 1 5 B 2 10 C 3 5 A 4 10 B 5 15 C 6 10 A 7 15 B 8 20 C ~~~ # 求和求A,B,C品牌,销售额都是多少传统做法 ~~~ import pandas as pd frame = pd.DataFrame( {'key': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C', ], 'data': [0, 5, 10, 5, 10, 15, 10, 15, 20]}) for key in ['A', 'B', 'C']: print(key, frame[frame['key'] == key].sum()) ~~~ 分组 ~~~ import pandas as pd frame = pd.DataFrame( {'key': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C', ], 'data': [0, 5, 10, 5, 10, 15, 10, 15, 20]}) key__sum = frame.groupby('key').sum() print(key__sum) ~~~ 输出 ~~~ data key A 15 B 30 C 45 ~~~ # 求平均值这3个品牌的平均值 ~~~ import pandas as pd import numpy as np frame = pd.DataFrame( {'key': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C', ], 'data': [0, 5, 10, 5, 10, 15, 10, 15, 20]}) aggregate = frame.groupby('key').aggregate(np.mean) print(aggregate) ~~~ 输出 ~~~ data key A 5 B 10 C 15 ~~~ **按性别分组求年龄的平均值** ~~~ import pandas as pd import numpy as np csv = pd.read_csv('./titanic.csv') mean = csv.groupby('Sex')['Age'].mean() print(mean) ~~~ 输出 ~~~ Sex female 27.915709 male 30.726645 Name: Age, dtype: float64 ~~~ # 按某个分组统计 ~~~ import pandas as pd import numpy as np df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.random.randn(8)}) # 按A分组统计 groupby = df.groupby('A') print(groupby.count()) ~~~ 输出 ~~~ B C D A bar 3 3 3 foo 5 5 5 ~~~ 也可以指定多个字段 ~~~ import pandas as pd import numpy as np df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.random.randn(8)}) # 按A分组统计 groupby = df.groupby(['A', 'B']) print(groupby.count()) ~~~ 输出 ~~~ C D A B bar one 1 1 three 1 1 two 1 1 foo one 2 2 three 1 1 two 2 2 ~~~ **传进来的字母小写在...里面就a,否则就b** ~~~ def get_letter_type(letter): if letter.lower() in 'aeiou': return 'a' else: return 'b' grouped = df.groupby(get_letter_type,axis = 1) grouped.count().iloc[0] ~~~ 输出 ~~~ a 1 b 3 Name: 0, dtype: int64 ~~~ # 打印重复值实际上是做了排序 ~~~ s = pd.Series([1, 2, 3, 1, 2, 3], [8, 7, 5, 8, 7, 5]) grouped = s.groupby(level=0) print(s) print(grouped.first()) print(grouped.last()) ~~~ 输出 ~~~ 8 1 7 2 5 3 8 1 7 2 5 3 dtype: int64 5 3 7 2 8 1 dtype: int64 5 3 7 2 8 1 dtype: int64 ~~~ # 求和 ~~~ import pandas as pd import numpy as np s = pd.Series([1, 2, 3, 1, 2, 3], [8, 7, 5, 8, 7, 5]) grouped = s.groupby(level=0) print(grouped.sum()) ~~~ # 排序 ~~~ s = pd.Series([1, 2, 3, 1, 2, 3], [8, 7, 5, 8, 7, 5]) grouped = s.groupby(level=0, sort=True) print(grouped.first()) ~~~ # 分组后关注某个值 ~~~ df2 = pd.DataFrame({'x': ['A', 'B', 'A', 'B'], 'Y': [1, 2, 3, 4]}) print(df2.groupby(['x']).get_group('A')) ~~~ 输出 ~~~ x Y 0 A 1 2 A 3 ~~~ # 构造多重索引 ~~~ arrays = [ ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'] ] # 设置2个索引 index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second']) print(index) s = pd.Series(np.random.randn(8), index=index) print(s) ~~~ 输出 ~~~ MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']], codes=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]], names=['first', 'second']) first second bar one 0.613790 two 1.432133 baz one 1.194075 two 1.863554 foo one -0.742547 two 0.228419 qux one 0.065728 two -2.212572 dtype: float64 ~~~ 取第一列求和 ~~~ # level = 'first'这样写也可以 groupby = s.groupby(level=0) print(groupby.sum()) ~~~ # 以某些列为键,求和顺便指定索引 ~~~ df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.random.randn(8)}) grouped = df.groupby(['A', 'B']) aggregate = grouped.aggregate(np.sum) print(aggregate) ~~~ 输出 ~~~ C D A B bar one 0.832853 1.304246 three 1.510920 -0.056116 two 1.096638 -0.830680 foo one 0.642804 0.215040 three 0.228107 -1.037871 two -2.145560 0.015041 ~~~ 设置不指定索引 ~~~ grouped = df.groupby(['A', 'B'], as_index=False) aggregate = grouped.aggregate(np.sum) print(aggregate) ~~~ 输出 ~~~ A B C D 0 bar one 0.659134 0.035910 1 bar three 0.873216 -0.432892 2 bar two -0.054760 -0.634816 3 foo one -0.662632 0.543061 4 foo three 0.043405 -1.381792 5 foo two -0.561851 -1.262610 ~~~ # 重新构建索引 ~~~ grouped = df.groupby(['A', 'B']).sum().reset_index() print(grouped) ~~~ 输出 ~~~ A B C D 0 bar one -1.154648 -0.809538 1 bar three -0.623831 1.076197 2 bar two -0.496645 0.879416 3 foo one -0.439856 -0.279243 4 foo three 0.688289 -1.738350 5 foo two 0.832524 -0.520591 ~~~ # 查看出现的次数 ~~~ grouped = df.groupby(['A', 'B']) print(grouped.size()) ~~~ 输出 ~~~ A B bar one 1 three 1 two 1 foo one 2 three 1 two 2 dtype: int64 ~~~ # 查看统计信息 ~~~ grouped = df.groupby(['A', 'B']) print(grouped.describe()) ~~~ 输出 ~~~ C ... D count mean std ... 50% 75% max A B ... bar one 1.0 -1.105223 NaN ... -2.206875 -2.206875 -2.206875 three 1.0 0.585855 NaN ... -0.411104 -0.411104 -0.411104 two 1.0 -0.524709 NaN ... 0.516504 0.516504 0.516504 foo one 2.0 -0.223097 0.579265 ... -0.257382 -0.191427 -0.125473 three 1.0 -0.369408 NaN ... -1.330020 -1.330020 -1.330020 two 2.0 -1.060815 0.384927 ... 0.482748 0.933142 1.383536 ~~~ # 查看部分统计信息以A为分组,统计里面C的信息 ~~~ grouped = df.groupby('A') print(grouped['C'].agg([np.sum, np.mean, np.std])) ~~~ 输出 ~~~ sum mean std A bar -2.574704 -0.858235 0.975134 foo 0.429849 0.085970 0.901520 ~~~ 指定别名 ~~~ grouped = df.groupby('A') print(grouped['C'].agg({'res_sum': np.sum, 'res_mean': np.mean, 'res_std': np.std})) ~~~