# 二十、数据可视化
> 作者:[Chris Albon](https://chrisalbon.com/)
>
> 译者:[飞龙](https://github.com/wizardforcel)
>
> 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/)
## MatPlotLib 中的双向条形图
```py
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# 创建数据帧
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'pre_score': [4, 24, 31, 2, 3],
'mid_score': [25, 94, 57, 62, 70],
'post_score': [5, 43, 23, 23, 51]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'pre_score', 'mid_score', 'post_score'])
df
```
| | first_name | pre_score | mid_score | post_score |
| --- | --- | --- | --- | --- |
| 0 | Jason | 4 | 25 | 5 |
| 1 | Molly | 24 | 94 | 43 |
| 2 | Tina | 31 | 57 | 23 |
| 3 | Jake | 2 | 62 | 23 |
| 4 | Amy | 3 | 70 | 51 |
```py
# 输入数据,特别是第二和
# 第三行,跳过第一列
x1 = df.ix[1, 1:]
x2 = df.ix[2, 1:]
# 创建条形标签
bar_labels = ['Pre Score', 'Mid Score', 'Post Score']
# 创建图形
fig = plt.figure(figsize=(8,6))
# 设置 y 的位置
y_pos = np.arange(len(x1))
y_pos = [x for x in y_pos]
plt.yticks(y_pos, bar_labels, fontsize=10)
# 在 y_pos 的位置上创建水平条形
plt.barh(y_pos,
# 使用数据 x1
x1,
# 中心对齐
align='center',
# 透明度为 0.4
alpha=0.4,
# 颜色为绿色
color='#263F13')
# 在 y_pos 的位置上创建水平条形
plt.barh(y_pos,
# 使用数据 -x2
-x2,
# 中心对齐
align='center',
# 透明度为 0.4
alpha=0.4,
# 颜色为绿色
color='#77A61D')
# 注解和标签
plt.xlabel('Tina\'s Score: Light Green. Molly\'s Score: Dark Green')
t = plt.title('Comparison of Molly and Tina\'s Score')
plt.ylim([-1,len(x1)+0.1])
plt.xlim([-max(x2)-10, max(x1)+10])
plt.grid()
plt.show()
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_back_to_back_bar_plot/matplotlib_back_to_back_bar_plot_6_0.png)
## MatPlotLib 中的条形图
```py
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# 创建数据帧
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'pre_score': [4, 24, 31, 2, 3],
'mid_score': [25, 94, 57, 62, 70],
'post_score': [5, 43, 23, 23, 51]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'pre_score', 'mid_score', 'post_score'])
df
```
| | first_name | pre_score | mid_score | post_score |
| --- | --- | --- | --- | --- |
| 0 | Jason | 4 | 25 | 5 |
| 1 | Molly | 24 | 94 | 43 |
| 2 | Tina | 31 | 57 | 23 |
| 3 | Jake | 2 | 62 | 23 |
| 4 | Amy | 3 | 70 | 51 |
```py
# 为每个变量创建得分均值的列表
mean_values = [df['pre_score'].mean(), df['mid_score'].mean(), df['post_score'].mean()]
# 创建变动列表,设为得分上下 .25
variance = [df['pre_score'].mean() * 0.25, df['pre_score'].mean() * 0.25, df['pre_score'].mean() * 0.25]
# 设置条形标签
bar_labels = ['Pre Score', 'Mid Score', 'Post Score']
# 创建条形的 x 位置
x_pos = list(range(len(bar_labels)))
# 在 x 位置上创建条形图
plt.bar(x_pos,
# 使用 mean_values 中的数据
mean_values,
# y-error 直线设置为变动
yerr=variance,
# 中心对齐
align='center',
# 颜色
color='#FFC222',
# 透明度为 0.5
alpha=0.5)
# 添加网格
plt.grid()
# 设置 y 轴高度
max_y = max(zip(mean_values, variance)) # returns a tuple, here: (3, 5)
plt.ylim([0, (max_y[0] + max_y[1]) * 1.1])
# 设置轴标签和标题
plt.ylabel('Score')
plt.xticks(x_pos, bar_labels)
plt.title('Mean Scores For Each Test')
plt.show()
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_bar_plot/matplotlib_bar_plot_6_0.png)
## Seaborn 中的调色板
```py
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# 创建数据帧
data = {'date': ['2014-05-01 18:47:05.069722', '2014-05-01 18:47:05.119994', '2014-05-02 18:47:05.178768', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.280592', '2014-05-03 18:47:05.332662', '2014-05-03 18:47:05.385109', '2014-05-04 18:47:05.436523', '2014-05-04 18:47:05.486877'],
'deaths_regiment_1': [34, 43, 14, 15, 15, 14, 31, 25, 62, 41],
'deaths_regiment_2': [52, 66, 78, 15, 15, 5, 25, 25, 86, 1],
'deaths_regiment_3': [13, 73, 82, 58, 52, 87, 26, 5, 56, 75],
'deaths_regiment_4': [44, 75, 26, 15, 15, 14, 54, 25, 24, 72],
'deaths_regiment_5': [25, 24, 25, 15, 57, 68, 21, 27, 62, 5],
'deaths_regiment_6': [84, 84, 26, 15, 15, 14, 26, 25, 62, 24],
'deaths_regiment_7': [46, 57, 26, 15, 15, 14, 26, 25, 62, 41]}
df = pd.DataFrame(data, columns = ['date', 'battle_deaths', 'deaths_regiment_1', 'deaths_regiment_2',
'deaths_regiment_3', 'deaths_regiment_4', 'deaths_regiment_5',
'deaths_regiment_6', 'deaths_regiment_7'])
df = df.set_index(df.date)
sns.palplot(sns.color_palette("deep", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_5_0.png)
```py
sns.palplot(sns.color_palette("muted", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_6_0.png)
```py
sns.palplot(sns.color_palette("bright", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_7_0.png)
```py
sns.palplot(sns.color_palette("dark", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_8_0.png)
```py
sns.palplot(sns.color_palette("colorblind", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_9_0.png)
```py
sns.palplot(sns.color_palette("Paired", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_10_0.png)
```py
sns.palplot(sns.color_palette("BuGn", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_11_0.png)
```py
sns.palplot(sns.color_palette("GnBu", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_12_0.png)
```py
sns.palplot(sns.color_palette("OrRd", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_13_0.png)
```py
sns.palplot(sns.color_palette("PuBu", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_14_0.png)
```py
sns.palplot(sns.color_palette("YlGn", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_15_0.png)
```py
sns.palplot(sns.color_palette("YlGnBu", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_16_0.png)
```py
sns.palplot(sns.color_palette("YlOrBr", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_17_0.png)
```py
sns.palplot(sns.color_palette("YlOrRd", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_18_0.png)
```py
sns.palplot(sns.color_palette("BrBG", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_19_0.png)
```py
sns.palplot(sns.color_palette("PiYG", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_20_0.png)
```py
sns.palplot(sns.color_palette("PRGn", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_21_0.png)
```py
sns.palplot(sns.color_palette("PuOr", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_22_0.png)
```py
sns.palplot(sns.color_palette("RdBu", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_23_0.png)
```py
sns.palplot(sns.color_palette("RdGy", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_24_0.png)
```py
sns.palplot(sns.color_palette("RdYlBu", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_25_0.png)
```py
sns.palplot(sns.color_palette("RdYlGn", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_26_0.png)
```py
sns.palplot(sns.color_palette("Spectral", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_27_0.png)
```py
# 创建调色板并将其设为当前调色板
flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
sns.set_palette(flatui)
sns.palplot(sns.color_palette())
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_29_0.png)
```py
# 设置绘图颜色
sns.tsplot([df.deaths_regiment_1, df.deaths_regiment_2, df.deaths_regiment_3, df.deaths_regiment_4,
df.deaths_regiment_5, df.deaths_regiment_6, df.deaths_regiment_7], color="#34495e")
# <matplotlib.axes._subplots.AxesSubplot at 0x116f5db70>
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes/seaborn_color_palettes_31_1.png)
## 使用 Seaborn 和 pandas 创建时间序列绘图
```py
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
data = {'date': ['2014-05-01 18:47:05.069722', '2014-05-01 18:47:05.119994', '2014-05-02 18:47:05.178768', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.280592', '2014-05-03 18:47:05.332662', '2014-05-03 18:47:05.385109', '2014-05-04 18:47:05.436523', '2014-05-04 18:47:05.486877'],
'deaths_regiment_1': [34, 43, 14, 15, 15, 14, 31, 25, 62, 41],
'deaths_regiment_2': [52, 66, 78, 15, 15, 5, 25, 25, 86, 1],
'deaths_regiment_3': [13, 73, 82, 58, 52, 87, 26, 5, 56, 75],
'deaths_regiment_4': [44, 75, 26, 15, 15, 14, 54, 25, 24, 72],
'deaths_regiment_5': [25, 24, 25, 15, 57, 68, 21, 27, 62, 5],
'deaths_regiment_6': [84, 84, 26, 15, 15, 14, 26, 25, 62, 24],
'deaths_regiment_7': [46, 57, 26, 15, 15, 14, 26, 25, 62, 41]}
df = pd.DataFrame(data, columns = ['date', 'battle_deaths', 'deaths_regiment_1', 'deaths_regiment_2',
'deaths_regiment_3', 'deaths_regiment_4', 'deaths_regiment_5',
'deaths_regiment_6', 'deaths_regiment_7'])
df = df.set_index(df.date)
sns.tsplot([df.deaths_regiment_1, df.deaths_regiment_2, df.deaths_regiment_3, df.deaths_regiment_4,
df.deaths_regiment_5, df.deaths_regiment_6, df.deaths_regiment_7], color="indianred")
# <matplotlib.axes._subplots.AxesSubplot at 0x1140be780>
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_pandas_timeseries_plot/seaborn_pandas_timeseries_plot_5_1.png)
```py
# 带有置信区间直线,但是没有直线的时间序列绘图
sns.tsplot([df.deaths_regiment_1, df.deaths_regiment_2, df.deaths_regiment_3, df.deaths_regiment_4,
df.deaths_regiment_5, df.deaths_regiment_6, df.deaths_regiment_7], err_style="ci_bars", interpolate=False)
# <matplotlib.axes._subplots.AxesSubplot at 0x116400668>
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_pandas_timeseries_plot/seaborn_pandas_timeseries_plot_7_1.png)
## 使用 Seaborn 创建散点图
```py
import pandas as pd
%matplotlib inline
import random
import matplotlib.pyplot as plt
import seaborn as sns
# 创建空数据帧
df = pd.DataFrame()
# 添加列
df['x'] = random.sample(range(1, 1000), 5)
df['y'] = random.sample(range(1, 1000), 5)
df['z'] = [1,0,0,1,0]
df['k'] = ['male','male','male','female','female']
# 查看前几行数据
df.head()
```
| | x | y | z | k |
| --- | --- | --- | --- | --- |
| 0 | 466 | 948 | 1 | male |
| 1 | 832 | 481 | 0 | male |
| 2 | 978 | 465 | 0 | male |
| 3 | 510 | 206 | 1 | female |
| 4 | 848 | 357 | 0 | female |
```py
# 设置散点图样式
sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")
# 创建数据帧的散点图
sns.lmplot('x', # 横轴
'y', # 纵轴
data=df, # 数据源
fit_reg=False, # 不要拟合回归直线
hue="z", # 设置颜色
scatter_kws={"marker": "D", # 设置标记样式
"s": 100}) # 设置标记大小
# 设置标题
plt.title('Histogram of IQ')
# 设置横轴标签
plt.xlabel('Time')
# 设置纵轴标签
plt.ylabel('Deaths')
# <matplotlib.text.Text at 0x112b7bb70>
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_scatterplot/seaborn_scatterplot_7_1.png)
## MatPlotLib 中的分组条形图
```py
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'pre_score': [4, 24, 31, 2, 3],
'mid_score': [25, 94, 57, 62, 70],
'post_score': [5, 43, 23, 23, 51]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'pre_score', 'mid_score', 'post_score'])
df
```
| | first_name | pre_score | mid_score | post_score |
| --- | --- | --- | --- | --- |
| 0 | Jason | 4 | 25 | 5 |
| 1 | Molly | 24 | 94 | 43 |
| 2 | Tina | 31 | 57 | 23 |
| 3 | Jake | 2 | 62 | 23 |
| 4 | Amy | 3 | 70 | 51 |
```py
# 设置条形的位置和宽度
pos = list(range(len(df['pre_score'])))
width = 0.25
# 绘制条形
fig, ax = plt.subplots(figsize=(10,5))
# 使用 pre_score 数据,
# 在位置 pos 上创建条形
plt.bar(pos,
# 使用数据 df['pre_score']
df['pre_score'],
# 宽度
width,
# 透明度为 0.5
alpha=0.5,
# 颜色
color='#EE3224',
# 标签是 first_name 的第一个值
label=df['first_name'][0])
# 使用 mid_score 数据,
# 在位置 pos + 一定宽度上创建条形
plt.bar([p + width for p in pos],
# 使用数据 df['mid_score']
df['mid_score'],
# 宽度
width,
# 透明度为 0.5
alpha=0.5,
# 颜色
color='#F78F1E',
# 标签是 first_name 的第二个值
label=df['first_name'][1])
# 使用 post_score 数据,
# 在位置 pos + 一定宽度上创建条形
plt.bar([p + width*2 for p in pos],
# 使用数据 df['post_score']
df['post_score'],
# 宽度
width,
# 透明度为 0.5
alpha=0.5,
# 颜色
color='#FFC222',
# 标签是 first_name 的第三个值
label=df['first_name'][2])
# 设置纵轴标签
ax.set_ylabel('Score')
# 设置标题
ax.set_title('Test Subject Scores')
# 设置 x 刻度的位置
ax.set_xticks([p + 1.5 * width for p in pos])
# 设置 x 刻度的标签
ax.set_xticklabels(df['first_name'])
# 设置横轴和纵轴的区域
plt.xlim(min(pos)-width, max(pos)+width*4)
plt.ylim([0, max(df['pre_score'] + df['mid_score'] + df['post_score'])] )
# 添加图例并展示绘图
plt.legend(['Pre Score', 'Mid Score', 'Post Score'], loc='upper left')
plt.grid()
plt.show()
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_grouped_bar_plot/matplotlib_grouped_bar_plot_6_0.png)
## MatPlotLib 中的直方图
```py
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
# 设置 ipython 的最大行数
pd.set_option('display.max_row', 1000)
# 将 ipython 的最大列宽设为 50
pd.set_option('display.max_columns', 50)
df = pd.read_csv('https://www.dropbox.com/s/52cb7kcflr8qm2u/5kings_battles_v1.csv?dl=1')
df.head()
```
| | name | year | battle_number | attacker_king | defender_king | attacker_1 | attacker_2 | attacker_3 | attacker_4 | defender_1 | defender_2 | defender_3 | defender_4 | attacker_outcome | battle_type | major_death | major_capture | attacker_size | defender_size | attacker_commander | defender_commander | summer | location | region | note |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | Battle of the Golden Tooth | 298 | 1 | Joffrey/Tommen Baratheon | Robb Stark | Lannister | NaN | NaN | NaN | Tully | NaN | NaN | NaN | win | pitched battle | 1 | 0 | 15000 | 4000 | Jaime Lannister | Clement Piper, Vance | 1 | Golden Tooth | The Westerlands | NaN |
| 1 | Battle at the Mummer's Ford | 298 | 2 | Joffrey/Tommen Baratheon | Robb Stark | Lannister | NaN | NaN | NaN | Baratheon | NaN | NaN | NaN | win | ambush | 1 | 0 | NaN | 120 | Gregor Clegane | Beric Dondarrion | 1 | Mummer's Ford | The Riverlands | NaN |
| 2 | Battle of Riverrun | 298 | 3 | Joffrey/Tommen Baratheon | Robb Stark | Lannister | NaN | NaN | NaN | Tully | NaN | NaN | NaN | win | pitched battle | 0 | 1 | 15000 | 10000 | Jaime Lannister, Andros Brax | Edmure Tully, Tytos Blackwood | 1 | Riverrun | The Riverlands | NaN |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 3 | Battle of the Green Fork | 298 | 4 | Robb Stark | Joffrey/Tommen Baratheon | Stark | NaN | NaN | NaN | Lannister | NaN | NaN | NaN | loss | pitched battle | 1 | 1 | 18000 | 20000 | Roose Bolton, Wylis Manderly, Medger Cerwyn, H... | Tywin Lannister, Gregor Clegane, Kevan Lannist... | 1 | Green Fork | The Riverlands | NaN |
| 4 | Battle of the Whispering Wood | 298 | 5 | Robb Stark | Joffrey/Tommen Baratheon | Stark | Tully | NaN | NaN | Lannister | NaN | NaN | NaN | win | ambush | 1 | 1 | 1875 | 6000 | Robb Stark, Brynden Tully | Jaime Lannister | 1 | Whispering Wood | The Riverlands | NaN |
```py
# 制作攻击方和防守方大小的两个变量
# 但是当有超过 10000 个攻击方时将其排除在外
data1 = df['attacker_size'][df['attacker_size'] < 90000]
data2 = df['defender_size'][df['attacker_size'] < 90000]
# 创建 2000 个桶
bins = np.arange(data1.min(), data2.max(), 2000) # 固定桶的大小
# 绘制攻击方大小的直方图
plt.hist(data1,
bins=bins,
alpha=0.5,
color='#EDD834',
label='Attacker')
# 绘制防守方大小的直方图
plt.hist(data2,
bins=bins,
alpha=0.5,
color='#887E43',
label='Defender')
# 设置图形的 x 和 y 边界
plt.ylim([0, 10])
# 设置标题和标签
plt.title('Histogram of Attacker and Defender Size')
plt.xlabel('Number of troops')
plt.ylabel('Number of battles')
plt.legend(loc='upper right')
plt.show()
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_histogram/matplotlib_histogram_6_0.png)
```py
# 制作攻击方和防守方大小的两个变量
# 但是当有超过 10000 个攻击方时将其排除在外
data1 = df['attacker_size'][df['attacker_size'] < 90000]
data2 = df['defender_size'][df['attacker_size'] < 90000]
# 创建 10 个桶,最小值为
# data1 和 data2 的最小值
bins = np.linspace(min(data1 + data2),
# 最大值为它们的最大值
max(data1 + data2),
# 并分为 10 个桶
10)
# 绘制攻击方大小的直方图
plt.hist(data1,
# 使用定义好的桶
bins=bins,
# 透明度
alpha=0.5,
# 颜色
color='#EDD834',
# 攻击方的标签
label='Attacker')
# 绘制防守方大小的直方图
plt.hist(data2,
# 使用定义好的桶
bins=bins,
# 透明度
alpha=0.5,
# 颜色
color='#887E43',
# 防守方的标签
label='Defender')
# 设置图形的 x 和 y 边界
plt.ylim([0, 10])
# 设置标题和标签
plt.title('Histogram of Attacker and Defender Size')
plt.xlabel('Number of troops')
plt.ylabel('Number of battles')
plt.legend(loc='upper right')
plt.show()
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_histogram/matplotlib_histogram_8_0.png)
## 从 Pandas 数据帧生成 MatPlotLib 散点图
```py
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
'female': [0, 1, 1, 0, 1],
'age': [42, 52, 36, 24, 73],
'preTestScore': [4, 24, 31, 2, 3],
'postTestScore': [25, 94, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'female', 'preTestScore', 'postTestScore'])
df
```
| | first_name | last_name | age | female | preTestScore | postTestScore |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | Jason | Miller | 42 | 0 | 4 | 25 |
| 1 | Molly | Jacobson | 52 | 1 | 24 | 94 |
| 2 | Tina | Ali | 36 | 1 | 31 | 57 |
| 3 | Jake | Milner | 24 | 0 | 2 | 62 |
| 4 | Amy | Cooze | 73 | 1 | 3 | 70 |
```py
# preTestScore 和 postTestScore 的散点图
# 每个点的大小取决于年龄
plt.scatter(df.preTestScore, df.postTestScore
, s=df.age)
# <matplotlib.collections.PathCollection at 0x10ca42b00>
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_scatterplot_from_pandas/matplotlib_scatterplot_from_pandas_6_1.png)
```py
# preTestScore 和 postTestScore 的散点图
# 大小为 300,颜色取决于性别
plt.scatter(df.preTestScore, df.postTestScore, s=300, c=df.female)
# <matplotlib.collections.PathCollection at 0x10cb90a90>
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_scatterplot_from_pandas/matplotlib_scatterplot_from_pandas_8_1.png)
## Matplotlib 的简单示例
```py
# 让 Jupyter 加载 matplotlib
# 并内联创建所有绘图(也就是在页面上)
%matplotlib inline
import matplotlib.pyplot as pyplot
pyplot.plot([1.6, 2.7])
# [<matplotlib.lines.Line2D at 0x10c4e7978>]
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_simple_example/matplotlib_simple_example_6_1.png)
## MatPlotLib 中的饼图
```py
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
raw_data = {'officer_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'jan_arrests': [4, 24, 31, 2, 3],
'feb_arrests': [25, 94, 57, 62, 70],
'march_arrests': [5, 43, 23, 23, 51]}
df = pd.DataFrame(raw_data, columns = ['officer_name', 'jan_arrests', 'feb_arrests', 'march_arrests'])
df
```
| | officer_name | jan_arrests | feb_arrests | march_arrests |
| --- | --- | --- | --- | --- |
| 0 | Jason | 4 | 25 | 5 |
| 1 | Molly | 24 | 94 | 43 |
| 2 | Tina | 31 | 57 | 23 |
| 3 | Jake | 2 | 62 | 23 |
| 4 | Amy | 3 | 70 | 51 |
```py
# 创建一列,其中包含每个官员的总逮捕数
df['total_arrests'] = df['jan_arrests'] + df['feb_arrests'] + df['march_arrests']
df
```
| | officer_name | jan_arrests | feb_arrests | march_arrests | total_arrests |
| --- | --- | --- | --- | --- | --- |
| 0 | Jason | 4 | 25 | 5 | 34 |
| 1 | Molly | 24 | 94 | 43 | 161 |
| 2 | Tina | 31 | 57 | 23 | 111 |
| 3 | Jake | 2 | 62 | 23 | 87 |
| 4 | Amy | 3 | 70 | 51 | 124 |
```py
# (从 iWantHue)创建一列颜色
colors = ["#E13F29", "#D69A80", "#D63B59", "#AE5552", "#CB5C3B", "#EB8076", "#96624E"]
# 创建饼图
plt.pie(
# 使用数据 total_arrests
df['total_arrests'],
# 标签为官员名称
labels=df['officer_name'],
# 没有阴影
shadow=False,
# 颜色
colors=colors,
# 将一块扇形移出去
explode=(0, 0, 0, 0, 0.15),
# 起始角度为 90 度
startangle=90,
# 将百分比列为分数
autopct='%1.1f%%',
)
# 使饼状图为正圆
plt.axis('equal')
# 查看绘图
plt.tight_layout()
plt.show()
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_pie_chart/matplotlib_pie_chart_7_0.png)
## MatPlotLib 中的散点图
```py
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# 展示 ipython 的最大行数
pd.set_option('display.max_row', 1000)
# 将 ipython 的最大列宽设为 50
pd.set_option('display.max_columns', 50)
df = pd.read_csv('https://raw.githubusercontent.com/chrisalbon/war_of_the_five_kings_dataset/master/5kings_battles_v1.csv')
df.head()
```
| | name | year | battle_number | attacker_king | defender_king | attacker_1 | attacker_2 | attacker_3 | attacker_4 | defender_1 | defender_2 | defender_3 | defender_4 | attacker_outcome | battle_type | major_death | major_capture | attacker_size | defender_size | attacker_commander | defender_commander | summer | location | region | note |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | Battle of the Golden Tooth | 298 | 1 | Joffrey/Tommen Baratheon | Robb Stark | Lannister | NaN | NaN | NaN | Tully | NaN | NaN | NaN | win | pitched battle | 1.0 | 0.0 | 15000.0 | 4000.0 | Jaime Lannister | Clement Piper, Vance | 1.0 | Golden Tooth | The Westerlands | NaN |
| 1 | Battle at the Mummer's Ford | 298 | 2 | Joffrey/Tommen Baratheon | Robb Stark | Lannister | NaN | NaN | NaN | Baratheon | NaN | NaN | NaN | win | ambush | 1.0 | 0.0 | NaN | 120.0 | Gregor Clegane | Beric Dondarrion | 1.0 | Mummer's Ford | The Riverlands | NaN |
| 2 | Battle of Riverrun | 298 | 3 | Joffrey/Tommen Baratheon | Robb Stark | Lannister | NaN | NaN | NaN | Tully | NaN | NaN | NaN | win | pitched battle | 0.0 | 1.0 | 15000.0 | 10000.0 | Jaime Lannister, Andros Brax | Edmure Tully, Tytos Blackwood | 1.0 | Riverrun | The Riverlands | NaN |
| 3 | Battle of the Green Fork | 298 | 4 | Robb Stark | Joffrey/Tommen Baratheon | Stark | NaN | NaN | NaN | Lannister | NaN | NaN | NaN | loss | pitched battle | 1.0 | 1.0 | 18000.0 | 20000.0 | Roose Bolton, Wylis Manderly, Medger Cerwyn, H... | Tywin Lannister, Gregor Clegane, Kevan Lannist... | 1.0 | Green Fork | The Riverlands | NaN |
| 4 | Battle of the Whispering Wood | 298 | 5 | Robb Stark | Joffrey/Tommen Baratheon | Stark | Tully | NaN | NaN | Lannister | NaN | NaN | NaN | win | ambush | 1.0 | 1.0 | 1875.0 | 6000.0 | Robb Stark, Brynden Tully | Jaime Lannister | 1.0 | Whispering Wood | The Riverlands | NaN |
```py
# 创建图形
plt.figure(figsize=(10,8))
# 创建散点图
# 298 年的攻击方大小为 x 轴
plt.scatter(df['attacker_size'][df['year'] == 298],
# 298 年的防守方大小为 y 轴
df['defender_size'][df['year'] == 298],
# 标记
marker='x',
# 颜色
color='b',
# 透明度
alpha=0.7,
# 大小
s = 124,
# 标签
label='Year 298')
# 299 年的攻击方大小为 x 轴
plt.scatter(df['attacker_size'][df['year'] == 299],
# 299 年的防守方大小为 y 轴
df['defender_size'][df['year'] == 299],
# 标记
marker='o',
# 颜色
color='r',
# 透明度
alpha=0.7,
# 大小
s = 124,
# 标签
label='Year 299')
# 300 年的攻击方大小为 x 轴
plt.scatter(df['attacker_size'][df['year'] == 300],
# 300 年的防守方大小为 x 轴
df['defender_size'][df['year'] == 300],
# 标记
marker='^',
# 颜色
color='g',
# 透明度
alpha=0.7,
# 大小
s = 124,
# 标签
label='Year 300')
# 标题
plt.title('Battles Of The War Of The Five Kings')
# y 标签
plt.ylabel('Defender Size')
# x 标签
plt.xlabel('Attacker Size')
# 图例
plt.legend(loc='upper right')
# 设置图形边界
plt.xlim([min(df['attacker_size'])-1000, max(df['attacker_size'])+1000])
plt.ylim([min(df['defender_size'])-1000, max(df['defender_size'])+1000])
plt.show()
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_simple_scatterplot/matplotlib_simple_scatterplot_6_0.png)
## MatPlotLib 中的栈式百分比条形图
```py
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'pre_score': [4, 24, 31, 2, 3],
'mid_score': [25, 94, 57, 62, 70],
'post_score': [5, 43, 23, 23, 51]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'pre_score', 'mid_score', 'post_score'])
df
```
| | first_name | pre_score | mid_score | post_score |
| --- | --- | --- | --- | --- |
| 0 | Jason | 4 | 25 | 5 |
| 1 | Molly | 24 | 94 | 43 |
| 2 | Tina | 31 | 57 | 23 |
| 3 | Jake | 2 | 62 | 23 |
| 4 | Amy | 3 | 70 | 51 |
```py
# 创建带有一个子图的图形
f, ax = plt.subplots(1, figsize=(10,5))
# 将条宽设为 1
bar_width = 1
# 条形左边界的位置
bar_l = [i for i in range(len(df['pre_score']))]
# x 轴刻度的位置(条形的中心是条形标签)
tick_pos = [i+(bar_width/2) for i in bar_l]
# 创建每个参与者的总得分
totals = [i+j+k for i,j,k in zip(df['pre_score'], df['mid_score'], df['post_score'])]
# 创建每个参与者的 pre_score 和总得分的百分比
pre_rel = [i / j * 100 for i,j in zip(df['pre_score'], totals)]
# 创建每个参与者的 mid_score 和总得分的百分比
mid_rel = [i / j * 100 for i,j in zip(df['mid_score'], totals)]
# 创建每个参与者的 post_score 和总得分的百分比
post_rel = [i / j * 100 for i,j in zip(df['post_score'], totals)]
# 在位置 bar_1 创建条形图
ax.bar(bar_l,
# 使用数据 pre_rel
pre_rel,
# 标签
label='Pre Score',
# 透明度
alpha=0.9,
# 颜色
color='#019600',
# 条形宽度
width=bar_width,
# 边框颜色
edgecolor='white'
)
# 在位置 bar_1 创建条形图
ax.bar(bar_l,
# 使用数据 mid_rel
mid_rel,
# 底部为 pre_rel
bottom=pre_rel,
# 标签
label='Mid Score',
# 透明度
alpha=0.9,
# 颜色
color='#3C5F5A',
# 条形宽度
width=bar_width,
# 边框颜色
edgecolor='white'
)
# Create a bar chart in position bar_1
ax.bar(bar_l,
# 使用数据 post_rel
post_rel,
# 底部为 pre_rel 和 mid_rel
bottom=[i+j for i,j in zip(pre_rel, mid_rel)],
# 标签
label='Post Score',
# 透明度
alpha=0.9,
# 颜色
color='#219AD8',
# 条形宽度
width=bar_width,
# 边框颜色
edgecolor='white'
)
# 将刻度设为 first_name
plt.xticks(tick_pos, df['first_name'])
ax.set_ylabel("Percentage")
ax.set_xlabel("")
# 设置图形边界
plt.xlim([min(tick_pos)-bar_width, max(tick_pos)+bar_width])
plt.ylim(-10, 110)
# 旋转轴标签
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')
# 展示绘图
plt.show()
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_percentage_stacked_bar_plot/matplotlib_percentage_stacked_bar_plot_6_0.png)