数据预处理与特征工程
机器学习数据预处理全流程:清洗、转换、特征选择与特征构造。
数据预处理和特征工程是机器学习项目中最重要的环节,直接决定模型性能。本文将详细介绍完整的数据处理流程。
数据预处理流程
原始数据 → 数据清洗 → 数据转换 → 特征工程 → 特征选择 → 模型训练
数据加载与探索
加载数据
import pandas as pd
import numpy as np
# Load a CSV file
df = pd.read_csv('data.csv')
# Load an Excel sheet (each read below overwrites df — they demonstrate alternatives)
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
# Load a JSON file
df = pd.read_json('data.json')
# Quick overview: dimensions, schema/memory, numeric summary
print(df.shape)
# DataFrame.info() prints directly and returns None, so don't wrap it in
# print() — the original `print(df.info())` emitted a stray "None" line
df.info()
print(df.describe())
探索性数据分析 (EDA)
import matplotlib.pyplot as plt
import seaborn as sns
# Preview the first rows (bare expressions like these only display in a notebook)
df.head()
# Column dtypes
df.dtypes
# Missing-value count per column
df.isnull().sum()
# Summary statistics of the numeric columns
df.describe()
# 2x2 grid of distribution plots
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# Histogram
df['age'].hist(ax=axes[0, 0])
axes[0, 0].set_title('Age Distribution')
# Box plot
df.boxplot(column='salary', ax=axes[0, 1])
axes[0, 1].set_title('Salary Boxplot')
# Correlation heatmap.
# numeric_only=True is required: df contains non-numeric columns (e.g.
# 'category'), and pandas >= 2.0 raises TypeError on DataFrame.corr()
# over mixed dtypes without it.
sns.heatmap(df.corr(numeric_only=True), annot=True, ax=axes[1, 0])
axes[1, 0].set_title('Correlation Matrix')
# Category frequency bar chart
df['category'].value_counts().plot(kind='bar', ax=axes[1, 1])
axes[1, 1].set_title('Category Distribution')
plt.tight_layout()
plt.show()
数据清洗
处理缺失值
# Count and percentage of missing values per column
missing = df.isnull().sum()
missing_pct = (missing / len(df) * 100).round(2)
print(pd.DataFrame({'count': missing, 'percent': missing_pct}))
# Drop rows with missing values
df_dropped = df.dropna()  # drop any row containing a NaN
df_dropped = df.dropna(subset=['important_col'])  # only consider specific columns
# Fill missing values.
# Assign the result back instead of chained `df[col].fillna(..., inplace=True)`:
# chained inplace operates on a temporary selection and silently stops
# modifying df under pandas copy-on-write (default in pandas 3.0).
df['age'] = df['age'].fillna(df['age'].mean())                    # mean imputation
df['category'] = df['category'].fillna(df['category'].mode()[0])  # mode imputation
df['salary'] = df['salary'].fillna(df['salary'].median())         # median imputation
# Forward/backward fill (time-series data).
# fillna(method='ffill'/'bfill') is deprecated since pandas 2.1 — use
# the dedicated ffill()/bfill() methods.
df['value'] = df['value'].ffill()
df['value'] = df['value'].bfill()
# Linear interpolation
df['value'] = df['value'].interpolate(method='linear')
处理重复值
# How many fully duplicated rows are there?
n_dupes = df.duplicated().sum()
print(f"Duplicates: {n_dupes}")
# Drop exact duplicate rows
df = df.drop_duplicates()
# Deduplicate on a subset of key columns, keeping the first occurrence
df = df.drop_duplicates(subset=['id', 'date'], keep='first')
处理异常值
# Outlier detection with the classic 1.5 * IQR fence
def detect_outliers_iqr(df, column):
    """Flag rows of *df* whose *column* value lies outside the 1.5*IQR fences.

    Returns a tuple ``(outlier_rows, lower_bound, upper_bound)``.
    """
    q1, q3 = df[column].quantile([0.25, 0.75])
    iqr = q3 - q1
    lo = q1 - 1.5 * iqr
    hi = q3 + 1.5 * iqr
    mask = (df[column] < lo) | (df[column] > hi)
    return df[mask], lo, hi
outliers, lower, upper = detect_outliers_iqr(df, 'salary')
print(f"Outliers: {len(outliers)}")
# Three alternative treatments for the detected outliers:
# Option 1: drop the rows outside the IQR fences
inside_fence = (df['salary'] >= lower) & (df['salary'] <= upper)
df = df[inside_fence]
# Option 2: winsorize (truncate) values to the fence boundaries
df['salary'] = df['salary'].clip(lower=lower, upper=upper)
# Option 3: overwrite out-of-range values with the boundary value
df.loc[df['salary'] < lower, 'salary'] = lower
df.loc[df['salary'] > upper, 'salary'] = upper
数据转换
数值型特征处理
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
# Z-score standardization (zero mean, unit variance)
z_scaler = StandardScaler()
df['salary_scaled'] = z_scaler.fit_transform(df[['salary']])
# Min-max normalization into the [0, 1] range
mm_scaler = MinMaxScaler()
df['age_normalized'] = mm_scaler.fit_transform(df[['age']])
# Robust scaling (median/IQR based, so insensitive to outliers)
rb_scaler = RobustScaler()
df['income_robust'] = rb_scaler.fit_transform(df[['income']])
# log(1 + x) transform to tame right-skewed distributions
df['salary_log'] = np.log1p(df['salary'])
# Box-Cox transform — input must be strictly positive, hence the +1 shift
from scipy import stats
df['salary_boxcox'], _ = stats.boxcox(df['salary'] + 1)
类别型特征处理
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Label encoding.
# NOTE(review): scikit-learn documents LabelEncoder for *target* labels;
# for ordinal feature columns OrdinalEncoder is the recommended tool.
le = LabelEncoder()
df['education_encoded'] = le.fit_transform(df['education'])
# One-hot encoding for nominal (unordered) categories
df_encoded = pd.get_dummies(df, columns=['city', 'gender'], drop_first=True)
# Same with scikit-learn.
# `sparse_output` replaces the old `sparse` kwarg (renamed in sklearn 1.2,
# removed in 1.4) — the old spelling raises TypeError on current versions.
ohe = OneHotEncoder(sparse_output=False, drop='first')
encoded = ohe.fit_transform(df[['city']])
encoded_df = pd.DataFrame(encoded, columns=ohe.get_feature_names_out())
# Target encoding: replace each category with the target mean for that category
def target_encode(df, column, target):
    """Return *column* with each category value replaced by the mean of *target*.

    NOTE(review): means are computed over the full frame, so applying this
    before a train/test split leaks target information — confirm intended use.
    """
    per_category_mean = df.groupby(column)[target].mean()
    return df[column].map(per_category_mean)
df['city_target_encoded'] = target_encode(df, 'city', 'salary')
日期特征处理
# Coerce the raw strings to datetime64
df['date'] = pd.to_datetime(df['date'])
# Pull out the basic calendar components via the .dt accessor
for part in ('year', 'month', 'day', 'dayofweek'):
    df[part] = getattr(df['date'].dt, part)
# dayofweek is Monday=0 ... Sunday=6, so 5 and 6 mark the weekend
df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
df['quarter'] = df['date'].dt.quarter
# Elapsed whole days between now and each date
df['days_since'] = (pd.Timestamp.now() - df['date']).dt.days
特征工程
特征构造
# Ratio / sum combinations of existing numeric columns
df['price_per_sqft'] = df['price'] / df['area']
df['total_rooms'] = df['bedrooms'] + df['bathrooms']
# Degree-2 polynomial and interaction terms
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(df[['age', 'income']])
# Fixed-edge binning of age into labelled groups
age_edges = [0, 18, 35, 50, 65, 100]
age_labels = ['少年', '青年', '中年', '中老年', '老年']
df['age_group'] = pd.cut(df['age'], bins=age_edges, labels=age_labels)
# Equal-frequency (quantile) binning into five buckets
df['income_quantile'] = pd.qcut(df['income'], q=5,
                                labels=['Q1', 'Q2', 'Q3', 'Q4', 'Q5'])
# Multiplicative interaction feature
df['age_income'] = df['age'] * df['income']
文本特征
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Bag-of-words term counts, keeping the 1000 most frequent tokens
count_vec = CountVectorizer(max_features=1000)
text_features = count_vec.fit_transform(df['description'])
# TF-IDF weights over unigrams and bigrams
tfidf = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
tfidf_features = tfidf.fit_transform(df['description'])
# Simple surface statistics of the raw text
df['text_length'] = df['description'].str.len()
df['word_count'] = df['description'].str.split().str.len()
特征选择
过滤法
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
# Variance threshold: drop near-constant features
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=0.1)
X_selected = selector.fit_transform(X)
# ANOVA F-test (classification).
# NOTE: the original comment called this a chi-square test, but f_classif
# computes the ANOVA F-statistic; use sklearn.feature_selection.chi2 for a
# chi-square test (it requires non-negative features).
selector = SelectKBest(f_classif, k=10)
X_selected = selector.fit_transform(X, y)
# Mutual information
selector = SelectKBest(mutual_info_classif, k=10)
X_selected = selector.fit_transform(X, y)
# Correlation filter: flag highly correlated feature pairs.
# numeric_only=True: pandas >= 2.0 raises TypeError on non-numeric columns
# otherwise. The `!= 1` term only masks the diagonal/perfect correlations.
correlation_matrix = df.corr(numeric_only=True)
high_corr = (correlation_matrix.abs() > 0.8) & (correlation_matrix != 1)
包装法
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Recursive feature elimination: repeatedly refit and drop the weakest feature
estimator = LogisticRegression()
rfe = RFE(estimator, n_features_to_select=10)
X_selected = rfe.fit_transform(X, y)
# Boolean support mask indexes the surviving column names
selected_features = X.columns[rfe.support_]
嵌入法
from sklearn.ensemble import RandomForestClassifier
# Model-based feature importance from a tree ensemble
model = RandomForestClassifier(n_estimators=100)
model.fit(X, y)
# Rank features by impurity-based importance, highest first
imp_values = model.feature_importances_
importance = (pd.DataFrame({'feature': X.columns, 'importance': imp_values})
              .sort_values('importance', ascending=False))
print(importance.head(10))
# L1 (lasso) regularization zeroes out coefficients of weak features
from sklearn.linear_model import LassoCV
lasso = LassoCV()
lasso.fit(X, y)
selected = X.columns[lasso.coef_ != 0]
数据管道
Pipeline 构建
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
# Column groups routed to different preprocessing branches
numeric_features = ['age', 'income', 'score']
categorical_features = ['city', 'education']
# Numeric branch: median-impute, then standardize
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical branch: constant-impute, then one-hot encode
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, numeric_features),
    ('cat', cat_pipe, categorical_features),
])
# End-to-end pipeline: preprocessing followed by the classifier
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier()),
])
# Fit on the training split
full_pipeline.fit(X_train, y_train)
# Predict on the held-out split
predictions = full_pipeline.predict(X_test)
总结
数据预处理的关键步骤:
- 数据探索: 了解数据分布和质量
- 数据清洗: 处理缺失值、异常值、重复值
- 数据转换: 标准化、编码、变换
- 特征工程: 构造有意义的新特征
- 特征选择: 选择最相关的特征
- Pipeline: 构建可复用的处理流程
下一篇将详细介绍监督学习中的回归算法。