Source code for geochemistrypi.data_mining.data.preprocessing
# -*- coding: utf-8 -*-
from typing import List
import numpy as np
import pandas as pd
from rich import print
from sklearn.feature_selection import GenericUnivariateSelect, SelectKBest, f_classif, f_regression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from .data_readiness import show_data_columns
[docs]
def feature_scaler(X: pd.DataFrame, method: List[str], method_idx: int) -> tuple[dict, np.ndarray]:
"""Apply feature scaling methods.
Parameters
----------
X : pd.DataFrame
The dataset.
method : str
The feature scaling methods.
method_idx : int
The index of methods.
Returns
-------
feature_scaling_config : dict
The feature scaling configuration.
X_scaled : np.ndarray
The dataset after imputing.
"""
if method[method_idx] == "Min-max Scaling":
scaler = MinMaxScaler()
elif method[method_idx] == "Standardization":
scaler = StandardScaler()
try:
X_scaled = scaler.fit_transform(X)
except ValueError:
print("The selected feature scaling method is not applicable to the dataset!")
print("Please check the dataset to find the reason.")
feature_scaling_config = {type(scaler).__name__: scaler.get_params()}
return feature_scaling_config, X_scaled
[docs]
def feature_selector(X: pd.DataFrame, y: pd.DataFrame, feature_selection_task: int, method: List[str], method_idx: int) -> tuple[dict, pd.DataFrame]:
"""Apply feature selection methods.
Parameters
----------
X : pd.DataFrame
The feature dataset.
y : pd.DataFrame
The label dataset.
feature_selection_task : int
Feature selection for regression or classification tasks.
method : str
The feature selection methods.
method_idx : int
The index of methods.
Returns
-------
feature_selection_config : dict
The feature selection configuration.
X_selected : pd.DataFrame
The feature dataset after selecting.
"""
print("--Original Features-")
show_data_columns(X.columns)
features_num = len(X.columns)
print(f"The original number of features is {features_num}, and your input must be less than {features_num}.")
features_retain_num = int(input("Please enter the number of features to retain.\n" "@input: "))
if feature_selection_task == 1:
score_func = f_regression
elif feature_selection_task == 2:
score_func = f_classif
if method[method_idx] == "GenericUnivariateSelect":
selector = GenericUnivariateSelect(score_func=score_func, mode="k_best", param=features_retain_num)
elif method[method_idx] == "SelectKBest":
selector = SelectKBest(score_func=score_func, k=features_retain_num)
try:
selector.fit(X, y)
features_selected = selector.get_feature_names_out()
X = X[features_selected]
except ValueError:
print("The selected feature selection method is not applicable to the dataset!")
print("Please check the dataset to find the reason.")
feature_selection_config = {type(selector).__name__: selector.get_params()}
return feature_selection_config, X