Skip to main content

特徵選擇_Data_Selection_鐵人賽示範_

Open In Colab

17.特徵選擇

下載及整理資料

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, SelectFromModel, chi2, f_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
# 下載鐵達尼號資料集
!wget -O data.csv https://raw.githubusercontent.com/duxuhao/Feature-Selection/master/example/titanic/clean_train.csv
df = pd.read_csv('data.csv')
df.describe(include='all')
# 確認有無缺失值
df.isna().sum()
# 特徵工程 (One-hot encoding)
df = pd.concat(
[df,pd.get_dummies(df["Title"])],
axis=1
)
df = df.drop("Title", 1)
# 切分特徵X與標籤Y
X = df.drop("Survived", 1)
Y = df["Survived"]
X.head()

定義評估模型

  • 本範例採具有階層性的sklearn.ensemble.RandomForestClassifier訓練資料,評估結果為驗證資料集的metrics。
def use_RandomForestClassifier_evaluation_metrics_on_test_set(X, Y):
    """Train a RandomForestClassifier on a stratified 80/20 split of (X, Y)
    and return the held-out test-set metrics as a dict.

    Returns a dict with 'accuracy', 'roc', 'precision', 'recall', 'f1' and
    'Feature Count' (the number of columns in X).
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, stratify=Y, random_state=9527)

    # Standardize features, fitting the scaler on the training split only
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit the random forest
    model = RandomForestClassifier(criterion='entropy', random_state=9527)
    model.fit(X_train_scaled, Y_train)

    # Predict on the held-out split
    predictions = model.predict(X_test_scaled)

    # Package the evaluation metrics
    return {
        'accuracy': accuracy_score(Y_test, predictions),
        'roc': roc_auc_score(Y_test, predictions),
        'precision': precision_score(Y_test, predictions),
        'recall': recall_score(Y_test, predictions),
        'f1': f1_score(Y_test, predictions),
        'Feature Count': len(X.columns),
    }

全特徵原始成效

# Baseline: evaluate the model with every feature included
res = pd.DataFrame(
    use_RandomForestClassifier_evaluation_metrics_on_test_set(X, Y),
    index=['ALL'])
res
# Correlation-matrix heatmap of the full dataframe
plt.figure(figsize=(15, 15))
cor = df.corr()
sns.heatmap(cor, annot=True)
plt.show()

過濾方法 Filter Method

依關聯性移除特徵

# Absolute correlation of every feature against the reference feature
cor_target = abs(cor["FamilySize"])

# Keep features whose correlation exceeds the threshold (0.2)
relevant_features = cor_target[cor_target > 0.2]

# FIX: Series.iteritems() was removed in pandas 2.0 — the names are simply
# the index of the filtered Series.
names = list(relevant_features.index)

# Drop the reference feature itself
names.remove('FamilySize')

print(names)
# FIX: DataFrame.append() was removed in pandas 2.0 — use pd.concat instead.
res = pd.concat([
    res,
    pd.DataFrame(
        use_RandomForestClassifier_evaluation_metrics_on_test_set(X[names], Y),
        index=['Remove High Corr']),
])
res
# Correlation matrix of the reduced feature set
plt.figure(figsize=(15, 15))
cor = X[names].corr()
sns.heatmap(cor, annot=True)
plt.show()

單變量特徵選取 Univariate Selection

def univariate_selection(X, Y, k=10):
    """Select the top-k features by ANOVA F-test and return their names.

    FIX: `k` was previously ignored — SelectKBest silently fell back to its
    own default of 10. It is now forwarded explicitly; the default value is
    unchanged, so existing callers see identical behavior.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, stratify=Y, random_state=9527)

    # Standardize on the training split only (test split is unused here)
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)

    # Use SelectKBest to pick the top-k features by F-test score
    selector = SelectKBest(f_classif, k=k)
    selector.fit(X_train_scaled, Y_train)

    feature_idx = selector.get_support()
    feature_names = X.columns[feature_idx]
    return feature_names
# FIX: DataFrame.append() was removed in pandas 2.0 — use pd.concat.
# Also run the selection once instead of twice.
selected = univariate_selection(X, Y)
selected
res = pd.concat([
    res,
    pd.DataFrame(
        use_RandomForestClassifier_evaluation_metrics_on_test_set(X[selected], Y),
        index=['Univariate Selection']),
])
res

包裝方法 Wrapper Method

遞迴特徵消除 Recursive feature elimination (RFE)

# Recursive Feature Elimination
def rfe_selection(X, Y, k=10):
    """Select k features via RFE driven by a RandomForestClassifier and
    return their column names.

    FIX: `k` was previously ignored — RFE fell back to its default of
    keeping half of all features. It is now forwarded as
    n_features_to_select, making the parameter meaningful.
    """
    # Stratified train/test split (only the training part is used here)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X,
        Y,
        test_size=0.2,
        stratify=Y,
        random_state=9527)

    # Standardize on the training split only
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)

    model = RandomForestClassifier(
        criterion='entropy',
        random_state=9527,
    )
    rfe = RFE(model, n_features_to_select=k)
    rfe = rfe.fit(X_train_scaled, Y_train)

    return X.columns[rfe.get_support()]

# FIX: DataFrame.append() was removed in pandas 2.0 — use pd.concat.
# Reuse the selection instead of running RFE a second time.
rfe_features = rfe_selection(X, Y, 10)
rfe_features
res = pd.concat([
    res,
    pd.DataFrame(
        use_RandomForestClassifier_evaluation_metrics_on_test_set(X[rfe_features], Y),
        index=['RFE']),
])

res

嵌入方法 Embedded Method

重要特徵 Feature importance

def feature_importance(X, Y):
    """Fit a RandomForestClassifier on the training split, plot its
    impurity-based feature importances, and return the fitted model.

    FIX: the forest previously used no random_state, so the plotted
    importances (and downstream feature selection) were not reproducible.
    It now uses criterion='entropy' and random_state=9527, consistent with
    every other forest in this notebook.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, stratify=Y, random_state=9527)

    # Standardize on the training split only (test split is unused here)
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)

    model = RandomForestClassifier(criterion='entropy', random_state=9527)
    model = model.fit(X_train_scaled, Y_train)

    # Horizontal bar chart, least to most important
    plt.figure(figsize=(10, 12))
    feat_importances = pd.Series(model.feature_importances_, index=X.columns)
    feat_importances.sort_values().plot(kind='barh')
    plt.show()
    return model


def select_features_from_model(model, X):
    """Return the column names of X whose importance in the already-fitted
    `model` clears the 0.013 threshold (SelectFromModel with prefit=True)."""
    selector = SelectFromModel(model, prefit=True, threshold=0.013)
    mask = selector.get_support()
    return X.columns[mask]

model = feature_importance(X, Y)
feature_imp_feature_names = select_features_from_model(model, X)
feature_imp_feature_names
# FIX: DataFrame.append() was removed in pandas 2.0 — use pd.concat.
res = pd.concat([
    res,
    pd.DataFrame(
        use_RandomForestClassifier_evaluation_metrics_on_test_set(
            X[feature_imp_feature_names], Y),
        index=['Feature Importance']),
])
res

L1正規化 L1 regularization

def run_l1_regularization(X, Y):
    """Select features via an L1-regularized LinearSVC and return their
    column names.

    NOTE(review): random_state=123 differs from the 9527 used everywhere
    else in this notebook — kept as-is to preserve results, but worth
    confirming it is intentional.
    """
    # Split train and test set (test split is unused by the selection)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, stratify=Y, random_state=123)

    # Standardize on the training split only
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)

    # The L1 penalty drives uninformative coefficients to exactly zero;
    # SelectFromModel keeps the features with non-zero weights.
    selection = SelectFromModel(LinearSVC(C=1, penalty='l1', dual=False))
    selection.fit(X_train_scaled, Y_train)

    return X.columns[selection.get_support()]

l1reg_feature_names = run_l1_regularization(X, Y)
# FIX: DataFrame.append() was removed in pandas 2.0 — use pd.concat.
# Reuse the names bound above instead of running the selection again.
res = pd.concat([
    res,
    pd.DataFrame(
        use_RandomForestClassifier_evaluation_metrics_on_test_set(
            X[l1reg_feature_names], Y),
        index=['L1']),
])
res

評估小結

# Summary: re-run every strategy and rank by test-set accuracy.
# FIX: chained DataFrame.append() was removed in pandas 2.0 — build all
# rows and concatenate once.
final_res = pd.concat([
    pd.DataFrame(use_RandomForestClassifier_evaluation_metrics_on_test_set(X, Y), index=['ALL']),
    pd.DataFrame(use_RandomForestClassifier_evaluation_metrics_on_test_set(X[names], Y), index=['Remove High Corr']),
    pd.DataFrame(use_RandomForestClassifier_evaluation_metrics_on_test_set(X[univariate_selection(X, Y)], Y), index=['Univariate Selection']),
    pd.DataFrame(use_RandomForestClassifier_evaluation_metrics_on_test_set(X[rfe_selection(X, Y)], Y), index=['RFE']),
    pd.DataFrame(use_RandomForestClassifier_evaluation_metrics_on_test_set(X[run_l1_regularization(X, Y)], Y), index=['L1']),
])
final_res.sort_values('accuracy', ascending=False)

參考