💎一站式轻松地调用各大LLM模型接口,支持GPT4、智谱、星火、月之暗面及文生图 广告
## 2.1 井下温度缺失值和异常值处理

```
import numpy as np

# Load the temperature column (index 1) as raw bytes; the CSV is GB2312-encoded,
# so decoding is deferred until each cell is inspected.
temperature_str = np.loadtxt('ug_detect.csv',
                             dtype=bytes,
                             delimiter=',',
                             skiprows=1,
                             usecols=(1,),
                             unpack=False)
print("读取出的数组是temperature_str:\n", temperature_str)

# Decode each byte entry into a float; empty cells become NaN.
# NOTE: np.nan is used instead of None — assigning None into a float ndarray
# raises TypeError on modern NumPy.
temperature = np.empty(len(temperature_str))
for index in range(len(temperature_str)):
    item = temperature_str[index]
    if item != b"":
        temperature[index] = float(item.decode('gb2312'))
    else:
        temperature[index] = np.nan

# Readings >= 500 are treated as sensor faults and marked missing.
# (NaN >= 500.0 is False, so already-missing cells pass through untouched.)
for index in range(len(temperature)):
    if temperature[index] >= 500.0:
        temperature[index] = np.nan

print("温度是:\n", temperature)

import matplotlib.pyplot as plt

t = np.arange(len(temperature))
plt.plot(t, temperature)
plt.plot(t, temperature, 'pr')
plt.show()


def bisec(dataArray):
    """Fill each interior NaN in-place with the mean of its two neighbours.

    Endpoints are skipped: index 0 has no left neighbour (the original
    wrapped around to dataArray[-1]) and the last index has no right
    neighbour (the original raised IndexError there).
    """
    for index in range(1, len(dataArray) - 1):
        if np.isnan(dataArray[index]):
            dataArray[index] = 0.5 * (dataArray[index - 1] + dataArray[index + 1])


bisec(temperature)

t = np.arange(len(temperature))
plt.plot(t, temperature)
plt.plot(t, temperature, 'pr')
plt.show()
```

## 2.2 使用pandas

```
import pandas as pd
import matplotlib.pyplot as plt
import scipy.interpolate as itp

ug_data = pd.read_csv('ug_detect.csv',
                      header=0,
                      encoding='gb2312')
temperature_data = ug_data[u'温度(?C)']
humidity_data = ug_data[u'相对湿度']
gas_data = ug_data[u'瓦斯(m?/min)']
co_data = ug_data[u'一氧化碳(m?/min)']


# Mark out-of-range readings as missing (None becomes NaN in a float Series).
def defectsCop(data_series, threshold):
    for index in range(len(data_series)):
        if data_series[index] >= float(threshold):
            data_series[index] = None


# Fill each missing value via Lagrange interpolation over its two neighbours.
# NOTE(review): like bisec above, this assumes the first/last entries are not
# missing — a NaN at either end would index out of range. TODO confirm the data.
def seriesItp(data_series):
    for index in range(len(data_series)):
        if pd.isnull(data_series[index]):
            x_list = [index - 1, index + 1]
            y_list = [data_series[index - 1],
                      data_series[index + 1]]
            lagrange_poly = itp.lagrange(x_list, y_list)
            data_series[index] = lagrange_poly(index)


defectsCop(temperature_data, 60)
defectsCop(humidity_data, 200)
defectsCop(gas_data, 100)
defectsCop(co_data, 100)
seriesItp(temperature_data)
seriesItp(humidity_data)
seriesItp(gas_data)
seriesItp(co_data)

all_data = pd.DataFrame({"温度": temperature_data,
                         "相对湿度": humidity_data,
                         "瓦斯浓度": gas_data,
                         "一氧化碳浓度": co_data})
all_data.to_csv('all_data_pandas.csv',
                index=False,
                encoding='gb2312')
```

## 3.1 歌词处理

```
# 1. Word-frequency statistics over the lyrics file.
import re
from collections import Counter

# Read the whole lyrics file.
with open('jaychou_lyrics.txt', 'r', encoding='utf-8') as f:
    lyrics = f.read()

# Tokenize into runs of word characters.
words = re.findall(r'\w+', lyrics)

# Count token frequencies and show the top 10.
word_count = Counter(words)
print("Top 10 words:")
for word, count in word_count.most_common(10):
    print(f"{word}: {count}")

# 2. Prompter: given one lyric line, return the next line.
import pandas as pd

file_name = 'jaychou_lyrics.txt'

# Load the file into a pandas Series, one element per line.
with open(file_name, 'r', encoding='utf-8') as f:
    lyrics = pd.Series(f.read().splitlines())


def get_next_line(input_line):
    """Return the line that follows *input_line* in the lyrics Series.

    If the line is not found, or it is the last line, return an
    explanatory message instead.
    """
    # Index of the first exact match (NaN when there is no match).
    index = lyrics[lyrics == input_line].index.min()
    if not pd.isnull(index) and index < len(lyrics) - 1:
        return lyrics.iloc[index + 1]
    else:
        return "未找到该句歌词或已经是最后一句了。"


user_input = input("请输入一句歌词:")
print(get_next_line(user_input))
```

## 4.1 幸福指数

### 补充map小练习

```
import pandas as pd

# A DataFrame with one column of Chinese words.
data = {
    'ChineseWords': ['你好', '谢谢', '再见']
}
df = pd.DataFrame(data)

# Translation table used as the mapping.
translation_dict = {
    '你好': 'Hello',
    '谢谢': 'Thank you',
    '再见': 'Goodbye'
}

# Series.map looks each value up in the dict; misses would become NaN.
df['EnglishWords'] = df['ChineseWords'].map(translation_dict)

print(df)
```

```
import pandas as pd

# Load the survey data and drop incomplete rows up front.
df = pd.read_excel('happy.xls').dropna()

# Data cleaning / preprocessing.
# Check for missing values (should all be 0 after dropna).
print(df.isnull().sum())

# Encode categorical variables as numbers.
# NOTE(review): each map covers only the listed categories; any other value
# becomes NaN — confirm the columns contain no other levels.
df['性别'] = df['性别'].map({'男': 1, '女': 0})
df['是否城市'] = df['是否城市'].map({'城市': 1, '农村': 0})
df['婚姻状况'] = df['婚姻状况'].map({'已婚': 1, '未婚': 0})
df['健康状况'] = df['健康状况'].map({'是': 1, '否': 0})
df['公共服务态度'] = df['公共服务态度'].map({'满意': 1, '不满意': 0})

# Drop identifier columns that carry no signal.
df = df.drop(['编号', '调查时间'], axis=1)

print(df.head(100))

# Task 2: feature engineering and standardization.
from sklearn.preprocessing import StandardScaler

# Derived features: age and total income.
df['年龄'] = 2023 - df['出生年']
df['总收入'] = df['个人收入'] + df['家庭收入']

# Drop the raw income columns now that they are combined (optional).
df = df.drop(['个人收入', '家庭收入'], axis=1)

# Standardize the new features to zero mean / unit variance.
# (The original had this phrase as a bare statement — a SyntaxError;
# it is a comment now.)
scaler = StandardScaler()
df[['年龄', '总收入']] = scaler.fit_transform(df[['年龄', '总收入']])

print(df.head())
```

## 5.3 内、外、左、右连接——合并母婴购物数据

```
import pandas as pd

# Load both tables, dropping incomplete rows.
mum_baby = pd.read_csv('mum_baby.csv').dropna()
trade_history = pd.read_csv('trade_history.csv').dropna()

# Right join on user_id: keep every trade_history row, matching baby info
# where available. The original discarded the result; keep and show it.
merged = pd.merge(mum_baby, trade_history, on='user_id', how='right')
print(merged)
```