Cross-entropy loss is not equivalent to binary log loss in LGBM

The problem this tries to solve: compress the training instances by aggregating the label (weighted average of the labels) and summing the weights of instances that share the same feature, while keeping the binary log loss the same as the cross-entropy loss. An example is below, and the log_loss test cases show that the binary log loss is equivalent to the weighted cross-entropy loss (a pandas sketch of the compression follows the table below).

original data:                                compressed_data

feature, label, weight, prediction            feature, label,  weight, prediction 
    x1,   1,     1,        0.8                 x1,      1/3,     3,       0.8
    x1,   0,     2,        0.8        -->            
    x2,   1,     2,        0.1                 x2,      2/3,     3,       0.1
    x2,   0,     1,        0.1
    x3,   1,     1,        0.9                 x3,      1,       1,       0.9
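
For concreteness, here is a minimal pandas sketch of this compression step (my own illustration, not code from the actual pipeline; the toy column names feature/label/weight are assumed):

import pandas as pd

# toy illustration of the compression described above
df = pd.DataFrame({
    'feature': ['x1', 'x1', 'x2', 'x2', 'x3'],
    'label':   [1, 0, 1, 0, 1],
    'weight':  [1, 2, 2, 1, 1],
})

compressed = (
    df.assign(weighted_label=df['label'] * df['weight'])
      .groupby('feature', as_index=False)
      .agg(weighted_label=('weighted_label', 'sum'), weight=('weight', 'sum'))
)
# the group label is the weighted average of the original labels
compressed['label'] = compressed['weighted_label'] / compressed['weight']
compressed = compressed.drop(columns='weighted_label')
print(compressed)   # x1 -> 1/3 (weight 3), x2 -> 2/3 (weight 3), x3 -> 1 (weight 1)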

The problem: the binary log loss is not always equivalent to the cross-entropy loss in LGBM. The change in model performance metrics (e.g. log loss, average precision and ROC AUC) is negligible, but the difference in the actual predictions and in the prediction distribution is quite large. Experiment 1 shows that the two objectives are equivalent in the binary-label case, while experiment 2 shows that in some cases the binary log loss does not match cross-entropy (see the examples below for details).

First, verify with numpy that the binary log loss is the same as the cross-entropy loss:

import numpy as np
import pandas as pd 
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from lightgbm.sklearn import LGBMRegressor, LGBMClassifier
import lightgbm


# use X of cancer data as training feature for both experiment 1 and 2 
X, _ = load_breast_cancer(return_X_y=True)


def logloss(y_true, y_pred, weight):
    # weighted binary cross-entropy: sum_i w_i * CE(y_i, p_i) / sum_i w_i
    l = np.mean((-(y_true * np.log(y_pred)) - ((1 - y_true) * np.log(1 - y_pred))) * weight)
    # normalize: multiplying by n and dividing by weight.sum() turns np.mean's 1/n into 1/sum(w)
    l = l * y_true.shape[0] / weight.sum()
    return l

"""
feature, label, weight, prediction            feature, label,  weight, prediction 
    x1,   1,     1/3,       0.7                
    x1,   1,     1/3,       0.7        -->       x1,    2/3,      1,       0.7      
    x1,   0,     1/3,       0.7
"""

l1 = logloss(np.array([1,1,0]), np.array([0.7,0.7,0.7]), np.array([1/3,1/3,1/3]))
l2 = logloss(np.array([2/3]), np.array([0.7]), np.array([1]))

"""
feature, label, weight, prediction            feature, label,  weight, prediction 
    x1,   1,     1,        0.8                 x1,      1/3,     3,       0.8
    x1,   0,     2,        0.8        -->            
    x2,   1,     2,        0.1                 x2,      2/3,     3,       0.1
    x2,   0,     1,        0.1
    x3,   1,     1,        0.9                 x3,      1,       1,       0.9
"""
l3 = logloss(np.array([1,0,1,0,1]), 
             np.array([0.8,0.8,0.1,0.1,0.9]), 
             np.array([1,2,2,1,1]))
l4 = logloss(np.array([1/3,2/3,1]), np.array([0.8,0.1,0.9]), np.array([3,3,1]))

np.testing.assert_almost_equal(l1, l2, decimal=4)
np.testing.assert_almost_equal(l3, l4, decimal=4) 
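
Why the compression preserves the weighted log loss (my own explanation; the original post does not spell this out): every instance in a group shares the same feature and therefore receives the same prediction p, and the cross-entropy is affine in the label, so

    sum_i w_i * ( -y_i*log(p) - (1 - y_i)*log(1 - p) )
      = -(sum_i w_i*y_i)*log(p) - (sum_i w_i*(1 - y_i))*log(1 - p)
      = W * ( -ybar*log(p) - (1 - ybar)*log(1 - p) ),   where W = sum_i w_i and ybar = (sum_i w_i*y_i)/W,

which is exactly the loss of one compressed row with label ybar and weight W. This is what the two assertions above verify numerically.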

Experiment 1 (binary log loss is equivalent to cross-entropy loss in the binary-label case):

######## data for experiment 1
np.random.seed(42)
n = X.shape[0]
y_binary = np.random.randint(0,2,size=(n))
eps = 1e-2
y_float = np.random.uniform(eps,1-eps,size=(n))

lgbm_params = {
    'boosting_type': 'gbdt',
    'class_weight': None,
    'colsample_bytree':1,
    'importance_type': 'split',
    'learning_rate': 0.06472914709339864,
    'max_depth': 46,
    'min_child_weight': 0.001, 
    'min_split_gain': 0.0,
    'n_estimators': 20,
    'n_jobs': 1,
    'num_leaves': 178,
    'random_state': 1574094090,
    'reg_alpha': 0.4894283599023894,
    'reg_lambda': 0.09743058458885945,
    'silent': True,
    'subsample':1,
#     'subsample_for_bin': 200000, # try larger values (10M+)
#     'subsample_freq': 252,
    'min_data_in_bin':1,
    'min_child_samples':1,    
 }

X_train_array, X_test_array, y_train_binary, y_test_binary, y_train_float, y_test_float = \
    train_test_split(X, y_binary, y_float, test_size=0.3, random_state=1)

##### binary-label case via the sklearn API: the 'binary' objective is equivalent to the 'cross_entropy' objective
binary_model1 = LGBMClassifier(objective='binary')
binary_model1.set_params(**lgbm_params)

binary_model1.fit(
    X_train_array, 
    y_train_binary, 
    sample_weight=np.ones(X_train_array.shape[0])
)

binary_model2 = LGBMRegressor(objective='cross_entropy')
binary_model2.set_params(**lgbm_params)
binary_model2.fit(
    X_train_array, 
    y_train_binary, 
    sample_weight=np.ones(X_train_array.shape[0])
)

binary_pred_1 = binary_model1.predict_proba(X_test_array)[:,1]
binary_pred_2 = binary_model2.predict(X_test_array)
binary_y_pred_diff = binary_pred_1-binary_pred_2

# the binary and cross_entropy objectives produce the same predictions given binary labels
np.testing.assert_almost_equal(binary_pred_1, binary_pred_2, decimal=4)
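
As an extra check (my addition, not in the original post), the logloss helper defined above can be used to confirm that the two models also agree at the loss level on the test split:

# extra sanity check: losses from the two objectives' predictions should be nearly identical
test_weight = np.ones(X_test_array.shape[0])
print(logloss(y_test_binary, binary_pred_1, test_weight))
print(logloss(y_test_binary, binary_pred_2, test_weight))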

Experiment 2: cross-entropy loss can differ from binary log loss (I don't know why):

######## data for experiment 2 

def make_compressed_df(X, fixed_ratio=None): 
    """
    this function simulates compressed data: instances that share the same feature are deduplicated,
    the label becomes the mean of the instance labels and the weight becomes the sum of the
    instance weights
    
    args:
        fixed_ratio: int or None; if int, neg_count = fixed_ratio * pos_count, so the ratio of
            pos_count to neg_count is constant across rows (the key of the experiment!)
    
    ex.
    
    original_data:                  compressed_data: 
    
    feature, label, weight            feature, label, pos_count, neg_count, weight, 
        x1,   1,     1                   
        x1,   1,     1        -->       x1,    2/3,       2,         1,       3
        x1,   0,     1
        -------------------------------------------------
        x2,   0,     1                   
        x2,   1,     1        -->       x2,    1/2,       1,         1,       2
        -------------------------------------------------   
        x3,   1,     1                   
        x3,   1,     1        -->       x3,    2/2,       2,         0,       2
        
    """
    compressed_df = pd.DataFrame(X)
    pos_count = np.random.randint(1,3,size=(X.shape[0]))
    compressed_df['pos_count'] = pos_count 
    if fixed_ratio:
        compressed_df['neg_count'] = int(fixed_ratio)*compressed_df['pos_count']
    else:
        neg_count = np.random.randint(1,3,size=(X.shape[0]))
        compressed_df['neg_count'] = neg_count 
        
    compressed_df['total_count'] = compressed_df['pos_count']+compressed_df['neg_count']

    compressed_df['weight'] = compressed_df['pos_count']+compressed_df['neg_count']
    compressed_df['label'] = compressed_df['pos_count']/compressed_df['total_count']
    
    return compressed_df


def restore_data(df):
    """
    restore the original features, labels and weights from pos_count and neg_count.
    each compressed row is repeated (pos_count + neg_count) times, the labels become
    [1]*pos_count + [0]*neg_count, and the weight becomes weight/(pos_count+neg_count)
    
    ex.
    
        compressed_data:                                     original_data: 
    
        feature, label, pos_count, neg_count, weight         feature, label, weight
                                                                x1,    1,     1
        x1,    2/3,       2,         1,       3        -->      x1,    1,     1
                                                                x1,    0,     1
        -------------------------------------------------
                                                                x2,    0,     1
        x2,    1/2,       1,         1,       2        -->      x2,    1,     1
        -------------------------------------------------                 
                                                                x3,    1,     1
        x3,    2/2,       2,         0,       2        -->      x3,    1,     1
        
        
    """     
    # .copy() avoids pandas' SettingWithCopyWarning when assigning the labels below
    pos_df = df.loc[df.index.repeat(df['pos_count'])].copy()
    pos_df['label'] = 1
    
    neg_df = df.loc[df.index.repeat(df['neg_count'])].copy()
    neg_df['label'] = 0
    
    df = pd.concat([pos_df, neg_df], axis=0)
    del pos_df, neg_df
    df['weight'] = df['weight']/df['total_count']
    df = df.drop(['pos_count', 'neg_count', 'total_count'], axis=1)    
    return df


def make_compressed_and_restored_data(X, fixed_ratio):
    np.random.seed(42)
    compressed_df = make_compressed_df(X, fixed_ratio)
    compressed_train_df, compressed_test_df = train_test_split(
        compressed_df, test_size=0.3, random_state=1)
    
    restored_train_df = restore_data(compressed_train_df)
    restored_test_df = restore_data(compressed_test_df)
    
    return (compressed_train_df, compressed_test_df), (restored_train_df, restored_test_df)


# when the pos_count/neg_count ratio is not fixed, the two objectives give different predictions
(compressed_train_random_ratio_df, compressed_test_df), \
    (restored_train_random_ratio_df, restored_test_random_ratio_df) = \
    make_compressed_and_restored_data(X, fixed_ratio=None)

model1 = LGBMClassifier(objective='binary')
model1.set_params(**lgbm_params)

model1.fit(
    restored_train_random_ratio_df.iloc[:,:30], 
    restored_train_random_ratio_df['label'], 
    sample_weight=restored_train_random_ratio_df['weight']
)

model2 = LGBMRegressor(objective='cross_entropy')
model2.set_params(**lgbm_params)
model2.fit(
    compressed_train_random_ratio_df.iloc[:,:30], 
    compressed_train_random_ratio_df['label'], 
    sample_weight=compressed_train_random_ratio_df['weight']
)

y1 = model1.predict_proba(compressed_test_df.iloc[:,:30])[:,1]
y2 = model2.predict(compressed_test_df.iloc[:,:30])
# this assertion fails
np.testing.assert_almost_equal(y1, y2, decimal=4)
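
To put numbers on the claim above that metric-level performance barely changes while the predictions themselves diverge, here is a sketch (my addition, assuming sklearn.metrics is available) that scores both models on the restored test split, which has binary labels and weights; since the assertion above raises, run this separately:

from sklearn.metrics import log_loss, roc_auc_score

rt = restored_test_random_ratio_df
p1 = model1.predict_proba(rt.iloc[:, :30])[:, 1]
p2 = model2.predict(rt.iloc[:, :30])

# metric-level comparison: these numbers should be close...
print('log_loss:', log_loss(rt['label'], p1, sample_weight=rt['weight']),
      log_loss(rt['label'], p2, sample_weight=rt['weight']))
print('roc_auc :', roc_auc_score(rt['label'], p1, sample_weight=rt['weight']),
      roc_auc_score(rt['label'], p2, sample_weight=rt['weight']))
# ...while the prediction-level difference is not
print('max |p1 - p2|:', np.abs(p1 - p2).max())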


# when the pos_count/neg_count ratio is fixed, the two objectives give the same predictions
(compressed_train_fixed_ratio_df, compressed_test_fixed_ratio_df), \
    (restored_train_fixed_ratio_df, restored_test_fixed_ratio_df) = \
    make_compressed_and_restored_data(X, fixed_ratio=2)

model3 = LGBMClassifier(objective='binary')
model3.set_params(**lgbm_params)

model3.fit(
    restored_train_fixed_ratio_df.iloc[:,:30], 
    restored_train_fixed_ratio_df['label'], 
    sample_weight=restored_train_fixed_ratio_df['weight']
)

model4 = LGBMRegressor(objective='cross_entropy')
model4.set_params(**lgbm_params)
model4.fit(
    compressed_train_fixed_ratio_df.iloc[:,:30], 
    compressed_train_fixed_ratio_df['label'], 
    sample_weight=compressed_train_fixed_ratio_df['weight']
)

y3 = model3.predict_proba(compressed_test_fixed_ratio_df.iloc[:,:30])[:,1]
y4 = model4.predict(compressed_test_fixed_ratio_df.iloc[:,:30])
# this assertion passes
np.testing.assert_almost_equal(y3, y4, decimal=4)

torchflow, 17.11.2020


Answers (1)


It looks like this question was posted both here and in the official LightGBM repository.

The LightGBM maintainers have provided an answer there: https://github.com/microsoft/LightGBM/issues/3576.

James Lamb, 18.11.2020

comment: You are right, this is the one I posted on GitHub, but it has not been resolved yet. - torchflow; 19.11.2020