## 2.1 井下温度缺失值和异常值处理
```
import numpy as np

# Read the temperature column (index 1) from the CSV as raw bytes,
# skipping the header row; empty cells come back as b"".
temperature_str = np.loadtxt('ug_detect.csv',
                             dtype=bytes,
                             delimiter=',',
                             skiprows=1,
                             usecols=1,
                             unpack=False)
print("读取出的数组是temperature_str:\n", temperature_str)
# Parse the byte strings into floats; empty cells become NaN.
# np.full pre-initialises the buffer — the original np.ndarray(n) call
# returned uninitialised memory and relied on every slot being overwritten.
temperature = np.full(len(temperature_str), np.nan)
for index, raw in enumerate(temperature_str):
    if raw != b"":
        # Cells are GB2312-encoded bytes read by np.loadtxt above.
        temperature[index] = float(raw.decode('gb2312'))
# Readings of 500 or more are sensor faults: blank them to NaN with a
# single vectorised mask instead of a per-element Python loop
# (NaN entries compare False and are left untouched).
temperature[temperature >= 500.0] = np.nan
print("温度是:\n", temperature)
import matplotlib.pyplot as plt

# Visualise the raw series: a line plus red pentagon markers per sample.
sample_idx = np.arange(len(temperature))
plt.plot(sample_idx, temperature)
plt.plot(sample_idx, temperature, 'pr')
plt.show()
def bisec(dataArray):
    """Fill NaN entries of dataArray in place with the mean of the two
    neighbouring samples.

    Boundary fix: the original version indexed dataArray[index + 1] on
    the last element (IndexError) and dataArray[-1] on the first element
    (silent wrap-around). Boundary NaNs now copy their single neighbour.
    """
    n = len(dataArray)
    for index in range(n):
        if np.isnan(dataArray[index]):
            if 0 < index < n - 1:
                dataArray[index] = 0.5 * (dataArray[index - 1] + dataArray[index + 1])
            elif index == 0 and n > 1:
                dataArray[index] = dataArray[1]        # no left neighbour
            elif index == n - 1 and n > 1:
                dataArray[index] = dataArray[index - 1]  # no right neighbour
# Repair the NaN gaps in place, then redraw the cleaned series.
bisec(temperature)
positions = np.arange(len(temperature))
plt.plot(positions, temperature)
plt.plot(positions, temperature, 'pr')
plt.show()
import time
import random

# NOTE(review): removed a leftover debug loop here —
#     while True: print("aaa"); time.sleep(5)
# — it blocked the script forever and did no data processing.
# The imports are kept in case other parts of the file rely on them.
```
## 2.2 使用pandas
```
import pandas as pd
import matplotlib.pyplot as plt
import scipy.interpolate as itp

# Load the sensor table (first row is the header) in its original
# GB2312 encoding, then pull out the four measurement columns.
# NOTE(review): the '?' in the column names looks like mojibake
# (likely ℃ / m³); the strings must match the CSV headers exactly.
ug_data = pd.read_csv('ug_detect.csv', header=0, encoding='gb2312')
temperature_data = ug_data[u'温度(?C)']
humidity_data = ug_data[u'相对湿度']
gas_data = ug_data[u'瓦斯(m?/min)']
co_data = ug_data[u'一氧化碳(m?/min)']
#寻找异常值并设置为None
def defectsCop(data_series, threshold):
    """Mark outliers in data_series as missing, in place.

    Any value >= threshold is replaced with NaN so it can be
    interpolated later (e.g. by seriesItp). NaN entries compare False
    and are left untouched.
    """
    # One boolean-mask assignment replaces the original per-index loop,
    # which also re-ran float(threshold) on every iteration.
    data_series[data_series >= float(threshold)] = float("nan")
def seriesItp(data_series):
    """Interpolate missing values of data_series in place.

    Each interior NaN is replaced by evaluating the degree-1 Lagrange
    polynomial through its two immediate neighbours (i.e. their mean).
    Boundary fix: the original indexed data_series[index - 1] at
    position 0 and data_series[index + 1] past the end (KeyError on a
    RangeIndex); boundary NaNs now copy their single neighbour.
    """
    last = len(data_series) - 1
    for index in range(len(data_series)):
        if pd.isnull(data_series[index]):
            if 0 < index < last:
                x_list = [index - 1, index + 1]
                y_list = [data_series[index - 1], data_series[index + 1]]
                lagrange_poly = itp.lagrange(x_list, y_list)
                data_series[index] = lagrange_poly(index)
            elif index == 0 and last > 0:
                data_series[index] = data_series[1]          # no left neighbour
            elif index == last and last > 0:
                data_series[index] = data_series[index - 1]  # no right neighbour
# Clean each channel: blank values above its physical ceiling, then
# interpolate the resulting gaps. Each series is independent, so
# processing them one-by-one is equivalent to the two original passes.
for series, ceiling in ((temperature_data, 60),
                        (humidity_data, 200),
                        (gas_data, 100),
                        (co_data, 100)):
    defectsCop(series, ceiling)
    seriesItp(series)

# Bundle the cleaned channels and save with the original encoding.
all_data = pd.DataFrame({"温度": temperature_data,
                         "相对湿度": humidity_data,
                         "瓦斯浓度": gas_data,
                         "一氧化碳浓度": co_data})
all_data.to_csv('all_data_pandas.csv', index=False, encoding='gb2312')
```
## 3.1 歌词处理
```
# 1 句频统计
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Load the whole lyrics file as one string.
with open('jaychou_lyrics.txt', 'r', encoding='utf-8') as f:
    lyrics = f.read()

# NOTE(review): despite the Chinese header ('句频' = sentence frequency),
# \w+ tokenises word-like runs, so this is a word-frequency count.
words = re.findall(r'\w+', lyrics)
word_count = Counter(words)

# Report the ten most frequent tokens.
print("Top 10 words:")
for word, count in word_count.most_common(10):
    print(f"{word}: {count}")
#2 提词器
import pandas as pd

# Lyrics source file, one lyric per line.
file_name = 'jaychou_lyrics.txt'

# Read the file and split into a Series of individual lines.
with open(file_name, 'r', encoding='utf-8') as f:
    lyrics = pd.Series(f.read().splitlines())
# 创建一个函数来查找并返回下一句歌词
def get_next_line(input_line):
    """Return the lyric line that follows input_line in the module-level
    `lyrics` Series, or a not-found message.

    Uses the first (smallest-index) match; .index.min() is NaN when the
    line does not occur at all.
    """
    first_match = lyrics[lyrics == input_line].index.min()
    # Guard clause: unknown line, or the match is already the last row.
    if pd.isnull(first_match) or first_match >= len(lyrics) - 1:
        return "未找到该句歌词或已经是最后一句了。"
    return lyrics.iloc[first_match + 1]
# 用户输入歌词
# Read one lyric line from the user and print the line that follows it.
print(get_next_line(input("请输入一句歌词:")))
```
## 4.1 幸福指数
### 补充map小练习
```
import pandas as pd

# Demo: translate a column of Chinese words with Series.map and a dict.
data = {'ChineseWords': ['你好', '谢谢', '再见']}
df = pd.DataFrame(data)

# Lookup table for map(); words absent from it would map to NaN.
translation_dict = {
    '你好': 'Hello',
    '谢谢': 'Thank you',
    '再见': 'Goodbye',
}

# Add the translated column and show the result.
df['EnglishWords'] = df['ChineseWords'].map(translation_dict)
print(df)
```
```
import pandas as pd

# Load the survey and drop incomplete rows up front.
df = pd.read_excel('happy.xls').dropna()

# --- Data cleaning / preprocessing ---
# Per-column missing-value counts (should all be 0 after dropna()).
print(df.isnull().sum())

# Encode categorical columns as 0/1 integers.
# NOTE(review): each map only covers the listed labels; any other label
# (e.g. additional marital statuses) becomes NaN — confirm the value sets.
df['性别'] = df['性别'].map({'男': 1, '女': 0})
df['是否城市'] = df['是否城市'].map({'城市': 1, '农村': 0})
df['婚姻状况'] = df['婚姻状况'].map({'已婚': 1, '未婚': 0})
df['健康状况'] = df['健康状况'].map({'是': 1, '否': 0})
df['公共服务态度'] = df['公共服务态度'].map({'满意': 1, '不满意': 0})

# Drop identifier columns that carry no predictive signal.
df = df.drop(['编号', '调查时间'], axis=1)
print(df.head(100))

# --- Task 2: feature engineering ---
from sklearn.preprocessing import StandardScaler

# Derived features: age (survey year hard-coded as 2023) and total income.
df['年龄'] = 2023 - df['出生年']
df['总收入'] = df['个人收入'] + df['家庭收入']

# The raw income columns are now redundant.
df = df.drop(['个人收入', '家庭收入'], axis=1)

# Standardise the new features to zero mean / unit variance.
# BUGFIX: the original line '数据标准化' was missing its '#', so it was
# executed as an identifier and raised NameError before the scaler ran.
scaler = StandardScaler()
df[['年龄', '总收入']] = scaler.fit_transform(df[['年龄', '总收入']])
print(df.head())
```
## 5.3 内、外、左、右连接——合并母婴购物数据
```
import pandas as pd

# Load the two tables, dropping rows with any missing values.
mum_baby = pd.read_csv('mum_baby.csv').dropna()
trade_history = pd.read_csv('trade_history.csv').dropna()

# Right join on user_id: keep every trade_history row and attach the
# matching mum_baby columns (NaN where no match exists).
# BUGFIX: the original expression discarded the merge result entirely;
# bind it to a name and show it so the join actually has an effect.
merged = pd.merge(mum_baby, trade_history, on='user_id', how='right')
print(merged)
```