Skip to content

Class SmoothedEncoding

Methods

fit(self, dataframe, y=None, columns_to_encode=None, operations=None, weight_of_overall=0.3, payload=None)

Parameters:

Name Type Description Default
dataframe DataFrame

dataframe containing column values

required
y Union[NoneType, pandas.core.series.Series, pandas.core.frame.DataFrame]

Target column(s) used to compute the encoding values, by default None

None
columns_to_encode Union[NoneType, int, str, List[Union[str, int]]]

Column names to encode, by default None

None
operations Optional[List[Callable]]

Encoding operations to perform; when None is passed, [np.mean] is used

None
weight_of_overall Union[float, int]

Weight given to the overall (whole-dataframe) aggregate when smoothing each category's value, by default 0.3

0.3
payload dict

Alternative way to compute all encodings in a single call. The payload is a dict of the form {'name of column to encode': {'name of column to encode over': ['operation function one', 'operation function two']}}, by default None

None
Source code in nitrofe\encoding\encoding_features.py
def fit(
    self,
    dataframe: pd.DataFrame,
    y: Union[None, pd.Series, pd.DataFrame] = None,
    columns_to_encode: Union[None, int, str, List[Union[str, int]]] = None,
    operations: Union[None, List[Callable]] = None,
    weight_of_overall: Union[float, int] = 0.3,
    payload: Union[None, dict] = None,
):
    """
    Compute smoothed target encodings for each column to encode.

    For every category of an encoded column and every (target, operation)
    pair, the stored value is::

        (category_value * n + weight_of_overall * overall_value)
        / (weight_of_overall + n)

    where ``category_value`` is the operation applied to the target within
    the category, ``overall_value`` is the same operation applied to the
    whole target column, and ``n`` is the non-null count of the encoded
    column within the category.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Dataframe containing the columns to encode.
    y : Union[None, pd.Series, pd.DataFrame], optional
        Target column(s) used to compute the encoding values. When None,
        an empty DataFrame is passed on instead, by default None
    columns_to_encode : Union[None, int, str, List[Union[str, int]]], optional
        Column names to encode. Only used when ``payload`` is None,
        by default None
    operations : Union[None, List[Callable]], optional
        Encoding operations to perform. Only used when ``payload`` is None;
        when None, ``[np.mean]`` is used, by default None
    weight_of_overall : Union[float, int], optional
        Weight given to the overall aggregate in the smoothing formula
        above, by default 0.3
    payload : dict, optional
        Alternative way to request all encodings in a single call, as
        {'name of column to encode': {'name of column to encode over':
        ['operation function one', 'operation function two']}},
        by default None

    Returns
    -------
    dict
        ``self.encoding_dict``: maps each encoded column name to a DataFrame
        indexed by that column's categories, with one column per
        (target, operation) pair named
        ``<column>_groupby_target_smoothed_<target>_<operation>``.
    """

    self.encoding_dict = {}
    self.payload = payload
    operations = [np.mean] if operations is None else operations
    self.weight_of_overall = weight_of_overall
    if y is None:
        y = pd.DataFrame()

    # Presumably builds self.concatenated_dataframe (and self.target_columns)
    # from the features and targets -- helper is defined outside this view.
    self._handle_concatenated_dataframe_column_names(y, dataframe)

    if self.payload is None:
        # Presumably validate the inputs and set self.columns_to_encode /
        # self.operations, which are read just below -- TODO confirm.
        self._check_fit_columns_to_encode(columns_to_encode, dataframe)
        self._check_operations(operations)

        # Default payload: every requested operation on every target column,
        # for every column to encode.
        self.payload = {
            _col: {
                target_items: self.operations
                for target_items in self.target_columns
            }
            for _col in self.columns_to_encode
        }

    self._check_weight_of_overall()

    for _col, target_operations in self.payload.items():
        grouped = self.concatenated_dataframe.groupby([_col])
        category_counts = grouped[_col].count()

        # Overall (ungrouped) aggregate of every target, laid out in the
        # same target-then-operation order as the grouped columns.
        # Iterate self.payload here, not the raw ``payload`` argument, which
        # is None whenever the payload was built from ``columns_to_encode``.
        overall_values = np.concatenate(
            [
                self.concatenated_dataframe.agg(
                    {target: target_ops}
                ).values.flatten()
                for target, target_ops in target_operations.items()
            ]
        ).ravel()

        _col_frame = (
            grouped.agg(target_operations).multiply(category_counts, axis=0)
            + self.weight_of_overall * overall_values
        ).div(self.weight_of_overall + category_counts, axis=0)

        # Flatten the (target, operation) MultiIndex into descriptive names.
        # f-strings also cope with non-string column labels.
        _col_frame.columns = [
            f"{_col}_groupby_target_smoothed_{lvlzero}_{lvlone}"
            for lvlzero, lvlone in zip(
                _col_frame.columns.get_level_values(0),
                _col_frame.columns.get_level_values(1),
            )
        ]
        self.encoding_dict[_col] = _col_frame

    return self.encoding_dict