# 第02章 DataFrame基本操作
```py
In[1]: import pandas as pd
import numpy as np
pd.options.display.max_columns = 40
```
## 1\. 选取多个DataFrame列
```py
# 用列表选取多个列
In[2]: movie = pd.read_csv('data/movie.csv')
movie_actor_director = movie[['actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name']]
movie_actor_director.head()
Out[2]:
```
![](https://img.kancloud.cn/99/73/9973590ec6e7ba26a29c4f64b981ab9b_1868x688.png)
```py
# 选取单列
In[3]: movie[['director_name']].head()
Out[3]:
```
![](https://img.kancloud.cn/4e/2a/4e2a5dc1aa0b6a9796443140f1a6a015_548x672.png)
```py
# 错误的选取多列的方式
In[4]: movie['actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name']
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2441 try:
-> 2442 return self._engine.get_loc(key)
2443 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5280)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas/_libs/hashtable.c:20523)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas/_libs/hashtable.c:20477)()
KeyError: ('actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name')
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-4-954222273e42> in <module>()
----> 1 movie['actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name']
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
1962 return self._getitem_multilevel(key)
1963 else:
-> 1964 return self._getitem_column(key)
1965
1966 def _getitem_column(self, key):
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
1969 # get column
1970 if self.columns.is_unique:
-> 1971 return self._get_item_cache(key)
1972
1973 # duplicate columns & possible reduce dimensionality
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
1643 res = cache.get(item)
1644 if res is None:
-> 1645 values = self._data.get(item)
1646 res = self._box_item_values(item, values)
1647 cache[item] = res
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
3588
3589 if not isnull(item):
-> 3590 loc = self.items.get_loc(item)
3591 else:
3592 indexer = np.arange(len(self.items))[isnull(self.items)]
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2442 return self._engine.get_loc(key)
2443 except KeyError:
-> 2444 return self._engine.get_loc(self._maybe_cast_indexer(key))
2445
2446 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5280)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas/_libs/hashtable.c:20523)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas/_libs/hashtable.c:20477)()
KeyError: ('actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name')
```
### 更多
```py
# 将列表赋值给一个变量,便于多选
In[5]: cols =['actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name']
movie_actor_director = movie[cols]
# 使用get_dtype_counts(),查看各数据类型的列数
In[6]: movie.get_dtype_counts()
Out[6]: float64 13
int64 3
object 12
dtype: int64
```
```py
# 使用select_dtypes(),选取整数列
In[7]: movie.select_dtypes(include=['int']).head()
Out[7]:
```
![](https://img.kancloud.cn/f6/52/f65291dc410eb13bbb7939dce6456d86_1370x396.png)
```py
# 选取所有的数值列
In[8]: movie.select_dtypes(include=['number']).head()
Out[8]:
```
![](https://img.kancloud.cn/36/ed/36ed822d14deec88a26de2729e7ee604_4004x1472.png)
```py
# 通过filter()函数过滤选取多列
In[9]: movie.filter(like='facebook').head()
Out[9]:
```
![](https://img.kancloud.cn/09/5b/095b7124c09a9d1062f5a80285b98207_4004x1144.png)
```py
# 通过正则表达式选取多列
In[10]: movie.filter(regex='\d').head()
Out[10]:
```
![](https://img.kancloud.cn/79/ba/79ba57655d2b2b2f6502ca14d854d943_1998x524.png)
```py
# filter()函数,传递列表到参数items,选取多列
In[11]: movie.filter(items=['actor_1_name', 'asdf']).head()
Out[11]:
```
![](https://img.kancloud.cn/34/94/3494f066664247a67298d6c1433464db_1480x772.png)
## 2\. 对列名进行排序
```py
# 读取movie数据集
In[12]: movie = pd.read_csv('data/movie.csv')
In[13]: movie.head()
Out[13]:
```
![](https://img.kancloud.cn/5c/a4/5ca4628b119f2b9ac2fadc2900c23974_4000x1316.png)
```py
# 打印列索引
In[14]: movie.columns
Out[14]: Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
dtype='object')
```
```py
# 将列索引按照指定的顺序排列
In[15]: disc_core = ['movie_title','title_year', 'content_rating','genres']
disc_people = ['director_name','actor_1_name', 'actor_2_name','actor_3_name']
disc_other = ['color','country','language','plot_keywords','movie_imdb_link']
cont_fb = ['director_facebook_likes','actor_1_facebook_likes','actor_2_facebook_likes',
'actor_3_facebook_likes', 'cast_total_facebook_likes', 'movie_facebook_likes']
cont_finance = ['budget','gross']
cont_num_reviews = ['num_voted_users','num_user_for_reviews', 'num_critic_for_reviews']
cont_other = ['imdb_score','duration', 'aspect_ratio', 'facenumber_in_poster']
In[16]: new_col_order = disc_core + disc_people + disc_other + \
cont_fb + cont_finance + cont_num_reviews + cont_other
set(movie.columns) == set(new_col_order)
Out[16]: True
In[17]: movie2 = movie[new_col_order]
movie2.head()
Out[17]:
```
![](https://img.kancloud.cn/f8/28/f82817de45c270a7100e4991a5cc077d_4000x1308.png)
## 3\. 在整个DataFrame上操作
```py
In[18]: pd.options.display.max_rows = 8
movie = pd.read_csv('data/movie.csv')
# 打印行数和列数
movie.shape
Out[18]: (4916, 28)
```
```py
# 打印数据的个数
In[19]: movie.size
Out[19]: 137648
```
```py
# 该数据集的维度
In[20]: movie.ndim
Out[20]: 2
```
```py
# 该数据集的长度
In[21]: len(movie)
Out[21]: 4916
```
```py
# 各个列的值的个数
In[22]: movie.count()
Out[22]: color 4897
director_name 4814
num_critic_for_reviews 4867
duration 4901
...
actor_2_facebook_likes 4903
imdb_score 4916
aspect_ratio 4590
movie_facebook_likes 4916
Length: 28, dtype: int64
```
```py
# 各列的最小值
In[23]: movie.min()
Out[23]: num_critic_for_reviews 1.00
duration 7.00
director_facebook_likes 0.00
actor_3_facebook_likes 0.00
...
actor_2_facebook_likes 0.00
imdb_score 1.60
aspect_ratio 1.18
movie_facebook_likes 0.00
Length: 16, dtype: float64
```
```py
# 打印描述信息
In[24]: movie.describe()
Out[24]:
```
![](https://img.kancloud.cn/ee/b1/eeb113c085436f1a7c8a1683ea8e7988_2012x538.png)
```py
# 使用percentiles参数指定分位数
In[25]: pd.options.display.max_rows = 10
In[26]: movie.describe(percentiles=[.01, .3, .99])
Out[26]:
```
![](https://img.kancloud.cn/b0/dd/b0dd6101b9b2bc9212d6b9d97fc27904_2000x552.png)
```py
# 打印各列空值的个数
In[27]: pd.options.display.max_rows = 8
In[28]: movie.isnull().sum()
Out[28]: color 19
director_name 102
num_critic_for_reviews 49
duration 15
...
actor_2_facebook_likes 13
imdb_score 0
aspect_ratio 326
movie_facebook_likes 0
Length: 28, dtype: int64
```
### 更多
```py
# 设定skipna=False后,只有不含缺失值的数值列才会计算出结果,含有缺失值的列返回NaN
In[29]: movie.min(skipna=False)
Out[29]: num_critic_for_reviews NaN
duration NaN
director_facebook_likes NaN
actor_3_facebook_likes NaN
...
actor_2_facebook_likes NaN
imdb_score 1.6
aspect_ratio NaN
movie_facebook_likes 0.0
Length: 16, dtype: float64
```
## 4\. 串联DataFrame方法
```py
# 使用isnull方法将每个值转变为布尔值
In[30]: movie = pd.read_csv('data/movie.csv')
movie.isnull().head()
Out[30]:
```
![](https://img.kancloud.cn/36/18/36182d2129fe9db59777de5fc89a204b_2008x360.png)
```py
# 使用sum统计布尔值,返回的是Series
In[31]: movie.isnull().sum().head()
Out[31]: color 19
director_name 102
num_critic_for_reviews 49
duration 15
director_facebook_likes 102
dtype: int64
```
```py
# 对这个Series再使用sum,返回整个DataFrame的缺失值的个数,返回值是个标量
In[32]: movie.isnull().sum().sum()
Out[32]: 2654
```
```py
# 判断整个DataFrame有没有缺失值,方法是连着使用两个any
In[33]: movie.isnull().any().any()
Out[33]: True
```
### 原理
```py
# isnull返回同样大小的DataFrame,但所有的值变为布尔值
In[34]: movie.isnull().get_dtype_counts()
Out[34]: bool 28
dtype: int64
```
### 更多
```py
# movie数据集的对象数据包含缺失值。默认条件下,聚合方法min、max、sum,不会返回任何值。
In[35]: movie[['color', 'movie_title', 'color']].max()
Out[35]: Series([], dtype: float64)
```
```py
# 要让pandas强行返回每列的值,必须填入缺失值。下面填入的是空字符串:
In[36]: movie.select_dtypes(['object']).fillna('').max()
Out[36]: color Color
director_name Étienne Faure
actor_2_name Zubaida Sahar
genres Western
...
movie_imdb_link http://www.imdb.com/title/tt5574490/?ref_=fn_t...
language Zulu
country West Germany
content_rating X
Length: 12, dtype: object
```
## 5\. 在DataFrame上使用运算符
```py
# college数据集的值既有数值也有对象,整数5不能与字符串相加
In[37]: college = pd.read_csv('data/college.csv')
college + 5
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/ops.py in na_op(x, y)
1175 result = expressions.evaluate(op, str_rep, x, y,
-> 1176 raise_on_error=True, **eval_kwargs)
1177 except TypeError:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/computation/expressions.py in evaluate(op, op_str, a, b, raise_on_error, use_numexpr, **eval_kwargs)
210 return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error,
--> 211 **eval_kwargs)
212 return _evaluate_standard(op, op_str, a, b, raise_on_error=raise_on_error)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/computation/expressions.py in _evaluate_numexpr(op, op_str, a, b, raise_on_error, truediv, reversed, **eval_kwargs)
121 if result is None:
--> 122 result = _evaluate_standard(op, op_str, a, b, raise_on_error)
123
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/computation/expressions.py in _evaluate_standard(op, op_str, a, b, raise_on_error, **eval_kwargs)
63 with np.errstate(all='ignore'):
---> 64 return op(a, b)
65
TypeError: must be str, not int
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/internals.py in eval(self, func, other, raise_on_error, try_cast, mgr)
1183 with np.errstate(all='ignore'):
-> 1184 result = get_result(other)
1185
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/internals.py in get_result(other)
1152 else:
-> 1153 result = func(values, other)
1154
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/ops.py in na_op(x, y)
1201 with np.errstate(all='ignore'):
-> 1202 result[mask] = op(xrav, y)
1203 else:
TypeError: must be str, not int
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-37-4749f68a2501> in <module>()
1 college = pd.read_csv('data/college.csv')
----> 2 college + 5
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/ops.py in f(self, other, axis, level, fill_value)
1239 self = self.fillna(fill_value)
1240
-> 1241 return self._combine_const(other, na_op)
1242
1243 f.__name__ = name
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in _combine_const(self, other, func, raise_on_error)
3541 def _combine_const(self, other, func, raise_on_error=True):
3542 new_data = self._data.eval(func=func, other=other,
-> 3543 raise_on_error=raise_on_error)
3544 return self._constructor(new_data)
3545
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/internals.py in eval(self, **kwargs)
3195
3196 def eval(self, **kwargs):
-> 3197 return self.apply('eval', **kwargs)
3198
3199 def quantile(self, **kwargs):
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
3089
3090 kwargs['mgr'] = self
-> 3091 applied = getattr(b, f)(**kwargs)
3092 result_blocks = _extend_blocks(applied, result_blocks)
3093
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/internals.py in eval(self, func, other, raise_on_error, try_cast, mgr)
1189 raise
1190 except Exception as detail:
-> 1191 result = handle_error()
1192
1193 # technically a broadcast error in numpy can 'work' by returning a
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/internals.py in handle_error()
1172 # The 'detail' variable is defined in outer scope.
1173 raise TypeError('Could not operate %s with block values %s' %
-> 1174 (repr(other), str(detail))) # noqa
1175 else:
1176 # return the values
TypeError: Could not operate 5 with block values must be str, not int
```
```py
# 行索引名设为INSTNM,用UGDS_过滤出本科生的种族比例
In[38]: college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds_ = college.filter(like='UGDS_')
In[39]: college == 'asdf' # 这一步来自原书的Jupyter Notebook:拿整个college和字符串'asdf'比较没有实际意义,会抛出TypeError,可以忽略
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-39-697c8af60bcf> in <module>()
----> 1 college == 'asdf'
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/ops.py in f(self, other)
1302 # straight boolean comparisions we want to allow all columns
1303 # (regardless of dtype to pass thru) See #4537 for discussion.
-> 1304 res = self._combine_const(other, func, raise_on_error=False)
1305 return res.fillna(True).astype(bool)
1306
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in _combine_const(self, other, func, raise_on_error)
3541 def _combine_const(self, other, func, raise_on_error=True):
3542 new_data = self._data.eval(func=func, other=other,
-> 3543 raise_on_error=raise_on_error)
3544 return self._constructor(new_data)
3545
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/internals.py in eval(self, **kwargs)
3195
3196 def eval(self, **kwargs):
-> 3197 return self.apply('eval', **kwargs)
3198
3199 def quantile(self, **kwargs):
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
3089
3090 kwargs['mgr'] = self
-> 3091 applied = getattr(b, f)(**kwargs)
3092 result_blocks = _extend_blocks(applied, result_blocks)
3093
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/internals.py in eval(self, func, other, raise_on_error, try_cast, mgr)
1203
1204 raise TypeError('Could not compare [%s] with block values' %
-> 1205 repr(other))
1206
1207 # transpose if needed
TypeError: Could not compare ['asdf'] with block values
```
```py
# 查看前5行
In[40]: college_ugds_.head()
Out[40]:
```
![](https://img.kancloud.cn/7b/3a/7b3adde06f0db25914c37e99d7e123a3_1082x503.png)
```py
# 现在都是均质数据了,可以进行数值运算
In[41]: college_ugds_.head() + .00501
Out[41]:
```
![](https://img.kancloud.cn/c7/de/c7defa746ffee09ad4b2c292adc3dd76_952x550.png)
```py
# 用地板除(//)计算取整后的百分比
In[42]: (college_ugds_.head() + .00501) // .01
Out[42]:
```
![](https://img.kancloud.cn/7b/05/7b05f05bf56bf1e81961e9b56d2af24d_958x554.png)
```py
# 再除以100
In[43]: college_ugds_op_round = (college_ugds_ + .00501) // .01 / 100
college_ugds_op_round.head()
Out[43]:
```
![](https://img.kancloud.cn/3d/f8/3df8630f13cf3de858ffa4d4edd72ed9_954x546.png)
```py
# 保留两位小数
In[44]: college_ugds_round = (college_ugds_ + .00001).round(2)
college_ugds_round.head()
Out[44]:
```
![](https://img.kancloud.cn/35/ab/35ab604a35310eaa556d6a0d7f579a94_958x545.png)
```py
In[45]: .045 + .005
Out[45]: 0.049999999999999996
```
```py
In[46]: college_ugds_op_round.equals(college_ugds_round)
Out[46]: True
```
### 更多
```py
# DataFrame的通用函数也可以实现上述方法
In[47]: college_ugds_op_round_methods = college_ugds_.add(.00501).floordiv(.01).div(100)
```
## 6\. 比较缺失值
```py
# Pandas使用NumPy NaN(np.nan)对象表示缺失值。这是一个不等于自身的特殊对象:
In[48]: np.nan == np.nan
Out[48]: False
```
```py
# Python的None对象是等于自身的
In[49]: None == None
Out[49]: True
```
```py
# 所有和np.nan的比较都返回False,除了不等于:
In[50]: 5 > np.nan
Out[50]: False
In[51]: np.nan > 5
Out[51]: False
In[52]: 5 != np.nan
Out[52]: True
```
```py
# college_ugds_所有值和.0019比较,返回布尔值DataFrame
In[53]: college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds_ = college.filter(like='UGDS_')
In[54]: college_ugds_.head() == .0019
Out[54]:
```
![](https://img.kancloud.cn/31/4e/314e071475103930e83ee509f7ea39a5_957x545.png)
```py
# 用DataFrame和DataFrame进行比较
In[55]: college_self_compare = college_ugds_ == college_ugds_
college_self_compare.head()
Out[55]:
```
![](https://img.kancloud.cn/aa/a6/aaa616bab21374b5d6e38b91a28869f9_949x495.png)
```py
# 用all()检查每列是否所有的值都是True;结果全为False,这是因为缺失值互不相等。
In[56]: college_self_compare.all()
Out[56]: UGDS_WHITE False
UGDS_BLACK False
UGDS_HISP False
UGDS_ASIAN False
...
UGDS_NHPI False
UGDS_2MOR False
UGDS_NRA False
UGDS_UNKN False
Length: 9, dtype: bool
```
```py
# 不能用==号和np.nan比较来统计缺失值:np.nan不等于任何值(包括它自身),所以求和的结果都是0
In[57]: (college_ugds_ == np.nan).sum()
Out[57]: UGDS_WHITE 0
UGDS_BLACK 0
UGDS_HISP 0
UGDS_ASIAN 0
..
UGDS_NHPI 0
UGDS_2MOR 0
UGDS_NRA 0
UGDS_UNKN 0
Length: 9, dtype: int64
```
```py
# 统计缺失值最主要方法是使用isnull方法:
In[58]: college_ugds_.isnull().sum()
Out[58]: UGDS_WHITE 661
UGDS_BLACK 661
UGDS_HISP 661
UGDS_ASIAN 661
...
UGDS_NHPI 661
UGDS_2MOR 661
UGDS_NRA 661
UGDS_UNKN 661
Length: 9, dtype: int64
```
```py
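# 比较两个DataFrame最直接的方法是使用equals()方法,它把相同位置上的缺失值视为相等
In[59]: college_ugds_.equals(college_ugds_)
Out[59]: True
# 单元测试中也可以使用assert_frame_equal:两个DataFrame相等时返回None(没有输出),不相等时抛出AssertionError
In[60]: from pandas.testing import assert_frame_equal
assert_frame_equal(college_ugds_, college_ugds_)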
```
### 更多
```py
# eq()方法类似于==,和前面的equals有所不同
In[61]: college_ugds_.eq(.0019).head()
Out[61]:
```
![](https://img.kancloud.cn/10/f0/10f0fa7a91ad4a28a378bb33135ed59e_959x551.png)
## 7\. 转换DataFrame操作的方向
```py
In[62]: college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds_ = college.filter(like='UGDS_')
college_ugds_.head()
Out[62]:
```
![](https://img.kancloud.cn/a7/25/a725060f66ebce866329b4c5959ae397_954x543.png)
```py
# count()返回非缺失值的个数
In[63]: college_ugds_.count()
Out[63]: UGDS_WHITE 6874
UGDS_BLACK 6874
UGDS_HISP 6874
UGDS_ASIAN 6874
...
UGDS_NHPI 6874
UGDS_2MOR 6874
UGDS_NRA 6874
UGDS_UNKN 6874
Length: 9, dtype: int64
```
```py
# axis默认设为0
In[64]: college_ugds_.count(axis=0)
Out[64]: UGDS_WHITE 6874
UGDS_BLACK 6874
UGDS_HISP 6874
UGDS_ASIAN 6874
...
UGDS_NHPI 6874
UGDS_2MOR 6874
UGDS_NRA 6874
UGDS_UNKN 6874
Length: 9, dtype: int64
```
```py
# 等价于axis='index'
In[65]: college_ugds_.count(axis='index')
Out[65]: UGDS_WHITE 6874
UGDS_BLACK 6874
UGDS_HISP 6874
UGDS_ASIAN 6874
...
UGDS_NHPI 6874
UGDS_2MOR 6874
UGDS_NRA 6874
UGDS_UNKN 6874
Length: 9, dtype: int64
```
```py
# 统计每行的非缺失值个数
In[66]: college_ugds_.count(axis='columns').head()
Out[66]: INSTNM
Alabama A & M University 9
University of Alabama at Birmingham 9
Amridge University 9
University of Alabama in Huntsville 9
Alabama State University 9
dtype: int64
```
```py
# 除了统计每行的非缺失值个数,也可以对每行求和加以确认:各种族比例之和应该为1
In[67]: college_ugds_.sum(axis='columns').head()
Out[67]: INSTNM
Alabama A & M University 1.0000
University of Alabama at Birmingham 0.9999
Amridge University 1.0000
University of Alabama in Huntsville 1.0000
Alabama State University 1.0000
dtype: float64
```
```py
# 用中位数了解每列的分布
In[68]: college_ugds_.median(axis='index')
Out[68]: UGDS_WHITE 0.55570
UGDS_BLACK 0.10005
UGDS_HISP 0.07140
UGDS_ASIAN 0.01290
...
UGDS_NHPI 0.00000
UGDS_2MOR 0.01750
UGDS_NRA 0.00000
UGDS_UNKN 0.01430
Length: 9, dtype: float64
```
### 更多
```py
# 使用累积求和cumsum()可以很容易看到白人、黑人、西班牙裔的比例
In[69]: college_ugds_cumsum = college_ugds_.cumsum(axis=1)
college_ugds_cumsum.head()
Out[69]:
```
![](https://img.kancloud.cn/a1/9e/a19e7170bb62e7e0b864da7ffe278fe2_958x547.png)
```py
# UGDS_HISP一列降序排列
In[70]: college_ugds_cumsum.sort_values('UGDS_HISP', ascending=False)
Out[70]:
```
![](https://img.kancloud.cn/3a/ce/3ace483a3d30b97ff3bc4e91731caece_953x1026.png)
## 8\. 确定大学校园多样性
```py
# US News给出的美国10所最具多样性的大学
In[71]: pd.read_csv('data/college_diversity.csv', index_col='School')
Out[71]:
```
![](https://img.kancloud.cn/1f/39/1f39dbac265dcae5406979c21a69e8d6_607x490.png)
```py
In[72]: college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds_ = college.filter(like='UGDS_')
college_ugds_.head()
Out[72]:
```
![](https://img.kancloud.cn/e3/75/e3750c301c19a65431e0e9304fe1b6a8_958x551.png)
```py
In[73]: college_ugds_.isnull().sum(axis=1).sort_values(ascending=False).head()
Out[73]: INSTNM
Excel Learning Center-San Antonio South 9
Philadelphia College of Osteopathic Medicine 9
Assemblies of God Theological Seminary 9
Episcopal Divinity School 9
Phillips Graduate Institute 9
dtype: int64
```
```py
# 如果某一行的所有列都是缺失值,则将该行去除
In[74]: college_ugds_ = college_ugds_.dropna(how='all')
In[75]: college_ugds_.isnull().sum()
Out[75]: UGDS_WHITE 0
UGDS_BLACK 0
UGDS_HISP 0
UGDS_ASIAN 0
..
UGDS_NHPI 0
UGDS_2MOR 0
UGDS_NRA 0
UGDS_UNKN 0
Length: 9, dtype: int64
```
```py
# 用大于或等于方法ge(),将DataFrame变为布尔值矩阵
In[76]: college_ugds_.ge(.15).head()
Out[76]:
```
![](https://img.kancloud.cn/62/e8/62e8f9f8c8def372018f122e42b4bc33_953x542.png)
```py
# 对所有True值求和
In[77]: diversity_metric = college_ugds_.ge(.15).sum(axis='columns')
diversity_metric.head()
Out[77]: INSTNM
Alabama A & M University 1
University of Alabama at Birmingham 2
Amridge University 3
University of Alabama in Huntsville 1
Alabama State University 1
dtype: int64
```
```py
# 使用value_counts(),查看分布情况
In[78]: diversity_metric.value_counts()
Out[78]: 1 3042
2 2884
3 876
4 63
0 7
5 2
dtype: int64
```
```py
# 查看哪些学校种群比例超过15%的数量多
In[79]: diversity_metric.sort_values(ascending=False).head()
Out[79]: INSTNM
Regency Beauty Institute-Austin 5
Central Texas Beauty College-Temple 5
Sullivan and Cogliano Training Center 4
Ambria College of Nursing 4
Berkeley College-New York 4
dtype: int64
```
```py
# 用.loc索引器查看对应行索引的行
In[80]: college_ugds_.loc[['Regency Beauty Institute-Austin',
'Central Texas Beauty College-Temple']]
Out[80]:
```
![](https://img.kancloud.cn/fc/c9/fcc9f6033f9c2e9a48590dc4de5d3477_954x350.png)
```py
# 查看US News前五所最具多样性的大学在diversity_metric中的情况
In[81]: us_news_top = ['Rutgers University-Newark',
'Andrews University',
'Stanford University',
'University of Houston',
'University of Nevada-Las Vegas']
In[82]: diversity_metric.loc[us_news_top]
Out[82]: INSTNM
Rutgers University-Newark 4
Andrews University 3
Stanford University 3
University of Houston 3
University of Nevada-Las Vegas 3
dtype: int64
```
### 更多
```py
# 可以用最大种群比例查看哪些学校最不具有多样性
In[83]: college_ugds_.max(axis=1).sort_values(ascending=False).head(10)
Out[83]: INSTNM
Dewey University-Manati 1.0
Yeshiva and Kollel Harbotzas Torah 1.0
Mr Leon's School of Hair Design-Lewiston 1.0
Dewey University-Bayamon 1.0
...
Monteclaro Escuela de Hoteleria y Artes Culinarias 1.0
Yeshiva Shaar Hatorah 1.0
Bais Medrash Elyon 1.0
Yeshiva of Nitra Rabbinical College 1.0
Length: 10, dtype: float64
```
```py
# 查看Talmudical Seminary Oholei Torah这所神学院
In[84]: college_ugds_.loc['Talmudical Seminary Oholei Torah']
Out[84]: UGDS_WHITE 1.0
UGDS_BLACK 0.0
UGDS_HISP 0.0
UGDS_ASIAN 0.0
...
UGDS_NHPI 0.0
UGDS_2MOR 0.0
UGDS_NRA 0.0
UGDS_UNKN 0.0
Name: Talmudical Seminary Oholei Torah, Length: 9, dtype: float64
```
```py
# 查看是否有学校九个种族的比例都超过了1%
In[85]: (college_ugds_ > .01).all(axis=1).any()
Out[85]: True
```