企业🤖AI智能体构建引擎,智能编排和调试,一键部署,支持私有化部署方案 广告
# 六、日期时间预处理 > 作者:[Chris Albon](https://chrisalbon.com/) > > 译者:[飞龙](https://github.com/wizardforcel) > > 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/) ## 把日期和时间拆成多个特征 ```py # 加载库 import pandas as pd # 创建数据帧 df = pd.DataFrame() # 创建 150 个日期(每周一个) df['date'] = pd.date_range('1/1/2001', periods=150, freq='W') # 为年、月、日、时、分创建特征 df['year'] = df['date'].dt.year df['month'] = df['date'].dt.month df['day'] = df['date'].dt.day df['hour'] = df['date'].dt.hour df['minute'] = df['date'].dt.minute # 展示三行 df.head(3) ``` | | date | year | month | day | hour | minute | | --- | --- | --- | --- | --- | --- | --- | | 0 | 2001-01-07 | 2001 | 1 | 7 | 0 | 0 | | 1 | 2001-01-14 | 2001 | 1 | 14 | 0 | 0 | | 2 | 2001-01-21 | 2001 | 1 | 21 | 0 | 0 | ## 计算日期时间之间的差 ```py # 加载库 import pandas as pd # 创建数据帧 df = pd.DataFrame() # 创建两个 datetime 特征 df['Arrived'] = [pd.Timestamp('01-01-2017'), pd.Timestamp('01-04-2017')] df['Left'] = [pd.Timestamp('01-01-2017'), pd.Timestamp('01-06-2017')] # 计算特征之间的间隔 df['Left'] - df['Arrived'] ''' 0 0 days 1 2 days dtype: timedelta64[ns] ''' # 计算特征之间的间隔,以整数天数表示 pd.Series(delta.days for delta in (df['Left'] - df['Arrived'])) ''' 0 0 1 2 dtype: int64 ''' ``` ## 将字符串转换为日期 ```py # 加载库 import numpy as np import pandas as pd # 创建字符串 date_strings = np.array(['03-04-2005 11:35 PM', '23-05-2010 12:01 AM', '04-09-2009 09:09 PM']) ``` 如果`errors="coerce"`,那么遇到问题时不会抛出错误(抛出错误才是默认行为),而是将导致错误的值设置为`NaT`(即缺失值)。 | 代码 | 描述 | 示例 | | --- | --- | --- | | ` %Y ` | 整年 | `2001` | | ` %m ` | 零填充的月份 | `04` | | ` %d ` | 零填充的日期 | `09` | | ` %I ` | 零填充的小时(12 小时) | `02` | | ` %p ` | AM 或 PM | `AM` | | ` %M ` | 零填充的分钟 | `05` | | ` %S ` | 零填充的秒钟 | `09` | ```py # 转换为 datetime [pd.to_datetime(date, format="%d-%m-%Y %I:%M %p", errors="coerce") for date in date_strings] ''' [Timestamp('2005-04-03 23:35:00'), Timestamp('2010-05-23 00:01:00'), Timestamp('2009-09-04 21:09:00')] ''' ``` ## 转换 pandas 列的时区 ```py # 加载库 import pandas as pd from pytz import all_timezones # 展示十个时区 all_timezones[0:10] ''' 
['Africa/Abidjan', 'Africa/Accra', 'Africa/Addis_Ababa', 'Africa/Algiers', 'Africa/Asmara', 'Africa/Asmera', 'Africa/Bamako', 'Africa/Bangui', 'Africa/Banjul', 'Africa/Bissau'] ''' # 创建十个日期 dates = pd.Series(pd.date_range('2/2/2002', periods=10, freq='M')) # 设置时区 dates_with_abidjan_time_zone = dates.dt.tz_localize('Africa/Abidjan') # 查看 pandas 序列 dates_with_abidjan_time_zone ''' 0 2002-02-28 00:00:00+00:00 1 2002-03-31 00:00:00+00:00 2 2002-04-30 00:00:00+00:00 3 2002-05-31 00:00:00+00:00 4 2002-06-30 00:00:00+00:00 5 2002-07-31 00:00:00+00:00 6 2002-08-31 00:00:00+00:00 7 2002-09-30 00:00:00+00:00 8 2002-10-31 00:00:00+00:00 9 2002-11-30 00:00:00+00:00 dtype: datetime64[ns, Africa/Abidjan] ''' # 转换时区 dates_with_london_time_zone = dates_with_abidjan_time_zone.dt.tz_convert('Europe/London') # 查看 pandas 序列 dates_with_london_time_zone ''' 0 2002-02-28 00:00:00+00:00 1 2002-03-31 00:00:00+00:00 2 2002-04-30 01:00:00+01:00 3 2002-05-31 01:00:00+01:00 4 2002-06-30 01:00:00+01:00 5 2002-07-31 01:00:00+01:00 6 2002-08-31 01:00:00+01:00 7 2002-09-30 01:00:00+01:00 8 2002-10-31 00:00:00+00:00 9 2002-11-30 00:00:00+00:00 dtype: datetime64[ns, Europe/London] ''' ``` ## 编码星期 ```py # 加载库 import pandas as pd # 创建数据集 dates = pd.Series(pd.date_range('2/2/2002', periods=3, freq='M')) # 查看数据 dates ''' 0 2002-02-28 1 2002-03-31 2 2002-04-30 dtype: datetime64[ns] ''' # 查看星期(注意:`weekday_name` 已在 pandas 1.0 中移除,新版本请改用 `dates.dt.day_name()`) dates.dt.weekday_name ''' 0 Thursday 1 Sunday 2 Tuesday dtype: object ''' ``` ## 处理时间序列中的缺失值 ```py # 加载库 import pandas as pd import numpy as np # 创建日期 time_index = pd.date_range('01/01/2010', periods=5, freq='M') # 创建数据帧,设置索引 df = pd.DataFrame(index=time_index) # 创建带有一些缺失值的特征 df['Sales'] = [1.0,2.0,np.nan,np.nan,5.0] # 对缺失值执行线性插值(默认方法) df.interpolate() ``` | | Sales | | --- | --- | | 2010-01-31 | 1.0 | | 2010-02-28 | 2.0 | | 2010-03-31 | 3.0 | | 2010-04-30 | 4.0 | | 2010-05-31 | 5.0 | ```py # 前向填充 df.ffill() ``` | | Sales | | --- | --- | | 2010-01-31 | 1.0 | | 2010-02-28 | 2.0 | | 2010-03-31 | 2.0 | | 2010-04-30 | 2.0 | | 
2010-05-31 | 5.0 | ```py # 后向填充 df.bfill() ``` | | Sales | | --- | --- | | 2010-01-31 | 1.0 | | 2010-02-28 | 2.0 | | 2010-03-31 | 5.0 | | 2010-04-30 | 5.0 | | 2010-05-31 | 5.0 | ```py # 对缺失值执行插值,每段连续缺失值最多只向前填充一个 df.interpolate(limit=1, limit_direction='forward') ``` | | Sales | | --- | --- | | 2010-01-31 | 1.0 | | 2010-02-28 | 2.0 | | 2010-03-31 | 3.0 | | 2010-04-30 | NaN | | 2010-05-31 | 5.0 | ## 处理时区 ```py # 加载库 import pandas as pd from pytz import all_timezones # 展示十个时区 all_timezones[0:10] ''' ['Africa/Abidjan', 'Africa/Accra', 'Africa/Addis_Ababa', 'Africa/Algiers', 'Africa/Asmara', 'Africa/Asmera', 'Africa/Bamako', 'Africa/Bangui', 'Africa/Banjul', 'Africa/Bissau'] ''' # 创建带时区的 datetime pd.Timestamp('2017-05-01 06:00:00', tz='Europe/London') # Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London') # 创建不带时区的 datetime date = pd.Timestamp('2017-05-01 06:00:00') # 设置时区 date_in_london = date.tz_localize('Europe/London') # 修改时区 date_in_london.tz_convert('Africa/Abidjan') # Timestamp('2017-05-01 05:00:00+0000', tz='Africa/Abidjan') ``` ## 平移时间特征 ```py # 加载库 import pandas as pd # 创建数据帧 df = pd.DataFrame() # 创建数据 df['dates'] = pd.date_range('1/1/2001', periods=5, freq='D') df['stock_price'] = [1.1,2.2,3.3,4.4,5.5] # 将值平移一行 df['previous_days_stock_price'] = df['stock_price'].shift(1) # 展示数据帧 df ``` | | dates | stock_price | previous_days_stock_price | | --- | --- | --- | --- | | 0 | 2001-01-01 | 1.1 | NaN | | 1 | 2001-01-02 | 2.2 | 1.1 | | 2 | 2001-01-03 | 3.3 | 2.2 | | 3 | 2001-01-04 | 4.4 | 3.3 | | 4 | 2001-01-05 | 5.5 | 4.4 | ## 滑动时间窗口 ```py # 加载库 import pandas as pd # 创建 datetime time_index = pd.date_range('01/01/2010', periods=5, freq='M') # 创建数据帧,设置索引 df = pd.DataFrame(index=time_index) # 创建特征 df['Stock_Price'] = [1,2,3,4,5] # 计算滑动均值 df.rolling(window=2).mean() ``` | | Stock_Price | | --- | --- | | 2010-01-31 | NaN | | 2010-02-28 | 1.5 | | 2010-03-31 | 2.5 | | 2010-04-30 | 3.5 | | 2010-05-31 | 4.5 | ```py # 识别滑动时间窗口中的最大值 df.rolling(window=2).max() ``` | | Stock_Price | | --- | --- | | 
2010-01-31 | NaN | | 2010-02-28 | 2.0 | | 2010-03-31 | 3.0 | | 2010-04-30 | 4.0 | | 2010-05-31 | 5.0 | ## 选择日期时间范围 ```py # 加载库 import pandas as pd # 创建数据帧 df = pd.DataFrame() # 创建 datetime df['date'] = pd.date_range('1/1/2001', periods=100000, freq='H') ``` 如果数据帧未按时间索引,请使用此方法。 ```py # 选择两个日期时间之间的观测 df[(df['date'] > '2002-1-1 01:00:00') & (df['date'] <= '2002-1-1 04:00:00')] ``` | | date | | --- | --- | | 8762 | 2002-01-01 02:00:00 | | 8763 | 2002-01-01 03:00:00 | | 8764 | 2002-01-01 04:00:00 | 如果数据帧按时间索引,请使用此方法。 ```py # 设置索引 df = df.set_index(df['date']) # 选择两个日期时间之间的观测 df.loc['2002-1-1 01:00:00':'2002-1-1 04:00:00'] ``` | | date | | --- | --- | | date | | | 2002-01-01 01:00:00 | 2002-01-01 01:00:00 | | 2002-01-01 02:00:00 | 2002-01-01 02:00:00 | | 2002-01-01 03:00:00 | 2002-01-01 03:00:00 | | 2002-01-01 04:00:00 | 2002-01-01 04:00:00 |