Решаемая задача: сжатие обучающих примеров путём агрегирования метки (взвешенное среднее меток) и суммирования весов для примеров с одинаковыми признаками так, чтобы бинарная логистическая функция потерь (binary log loss) на исходных данных совпадала с кросс-энтропией (cross-entropy) на сжатых данных. Ниже приведены пример и тесты функции logloss, показывающие, что binary log loss на исходных данных эквивалентна взвешенной log loss на сжатых данных.
original data:                               compressed_data:
feature, label, weight, prediction           feature, label, weight, prediction
x1,      1,     1,      0.8                  x1,      1/3,   3,      0.8
x1,      0,     2,      0.8           -->
x2,      1,     2,      0.1                  x2,      2/3,   3,      0.1
x2,      0,     1,      0.1
x3,      1,     1,      0.9                  x3,      1,     1,      0.9
Проблема: в LightGBM (lgbm) binary log loss не всегда эквивалентна кросс-энтропии (cross-entropy). Различия в метриках качества модели (например, log loss, average precision и ROC AUC) незначительны, однако сами прогнозы и их распределение отличаются весьма существенно. Эксперимент 1 показывает, что обе функции потерь эквивалентны в случае бинарных меток, тогда как эксперимент 2 показывает, что в некоторых случаях binary log loss не совпадает с кросс-энтропией (подробности см. в примерах).
Сначала убедимся с помощью numpy, что binary log loss совпадает с кросс-энтропией:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from lightgbm.sklearn import LGBMRegressor, LGBMClassifier
import lightgbm
# Use the feature matrix X of the breast cancer dataset as the training features
# for both experiment 1 and experiment 2; the dataset's own labels are discarded.
X, _ = load_breast_cancer(return_X_y=True)
def logloss(y_true, y_pred, weight, eps=1e-15):
    """Return the weighted binary cross-entropy, normalized by the total weight.

    Args:
        y_true: array-like of labels; hard 0/1 labels or soft labels in [0, 1].
        y_pred: array-like of predicted probabilities, same length as ``y_true``.
        weight: array-like of per-instance weights, same length as ``y_true``.
        eps: predictions are clipped to ``[eps, 1 - eps]`` so that a prediction
            of exactly 0 or 1 cannot produce ``0 * log(0) = nan``.

    Returns:
        ``sum(weight * loss) / sum(weight)``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
    weight = np.asarray(weight, dtype=float)
    per_instance = -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    # Dividing the weighted sum by the total weight is the same normalization
    # as mean(loss * weight) * n / sum(weight), without the detour through n.
    return np.sum(per_instance * weight) / weight.sum()
# Check 1: three instances of x1 with weight 1/3 each, versus one compressed
# instance carrying the mean label and the summed weight.
#
#   feature, label, weight, prediction        feature, label, weight, prediction
#   x1,      1,     1/3,    0.7
#   x1,      1,     1/3,    0.7        -->    x1,      2/3,   1,      0.7
#   x1,      0,     1/3,    0.7
l1 = logloss(
    y_true=np.array([1, 1, 0]),
    y_pred=np.array([0.7, 0.7, .7]),
    weight=np.array([1/3, 1/3, 1/3]),
)
l2 = logloss(
    y_true=np.array([2/3]),
    y_pred=np.array([0.7]),
    weight=np.array([1]),
)

# Check 2: unequal weights and several distinct features.
#
#   feature, label, weight, prediction        feature, label, weight, prediction
#   x1,      1,     1,      0.8               x1,      1/3,   3,      0.8
#   x1,      0,     2,      0.8        -->
#   x2,      1,     2,      0.1               x2,      2/3,   3,      0.1
#   x2,      0,     1,      0.1
#   x3,      1,     1,      0.9               x3,      1,     1,      0.9
l3 = logloss(
    y_true=np.array([1, 0, 1, 0, 1]),
    y_pred=np.array([0.8, 0.8, 0.1, 0.1, 0.9]),
    weight=np.array([1, 2, 2, 1, 1]),
)
l4 = logloss(
    y_true=np.array([1/3, 2/3, 1]),
    y_pred=np.array([0.8, 0.1, 0.9]),
    weight=np.array([3, 3, 1]),
)

# The original and the compressed form must give the same loss in both checks.
np.testing.assert_almost_equal(l1, l2, decimal=4)
np.testing.assert_almost_equal(l3, l4, decimal=4)
Эксперимент 1 (binary log loss эквивалентна кросс-энтропии в случае бинарных меток):
######## data for experiment 1
# Seed the global NumPy RNG so both label vectors are reproducible.
np.random.seed(42)
n = len(X)
# Hard 0/1 labels, one per row of X.
y_binary = np.random.randint(low=0, high=2, size=n)
# Soft labels drawn from [eps, 1 - eps), i.e. kept away from exactly 0 and 1.
eps = 1e-2
y_float = np.random.uniform(low=eps, high=1 - eps, size=n)
# Hyper-parameters shared by every LightGBM model in both experiments, so that
# the objective is the only thing that differs between the compared models.
lgbm_params = dict(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=1,
    importance_type='split',
    learning_rate=0.06472914709339864,
    max_depth=46,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=20,
    n_jobs=1,
    num_leaves=178,
    random_state=1574094090,
    reg_alpha=0.4894283599023894,
    reg_lambda=0.09743058458885945,
    # NOTE(review): `silent` may not be accepted by newer LightGBM releases -- confirm
    # against the installed version.
    silent=True,
    subsample=1,
    # subsample_for_bin=200000,  # try larger values (10M+)
    # subsample_freq=252,
    min_data_in_bin=1,
    min_child_samples=1,
)
# A single split call keeps the feature rows and both label vectors aligned.
(
    X_train_array,
    X_test_array,
    y_train_binary,
    y_test_binary,
    y_train_float,
    y_test_float,
) = train_test_split(X, y_binary, y_float, test_size=0.3, random_state=1)

##### binary label case in sklearn API: the binary objective is expected to be
##### equivalent to the cross_entropy objective
binary_model1 = LGBMClassifier(objective='binary').set_params(**lgbm_params)
binary_model1.fit(
    X_train_array,
    y_train_binary,
    sample_weight=np.ones(len(X_train_array)),
)

binary_model2 = LGBMRegressor(objective='cross_entropy').set_params(**lgbm_params)
binary_model2.fit(
    X_train_array,
    y_train_binary,
    sample_weight=np.ones(len(X_train_array)),
)

# The classifier returns one column per class; column 1 is compared with the
# regressor's direct output.
binary_pred_1 = binary_model1.predict_proba(X_test_array)[:, 1]
binary_pred_2 = binary_model2.predict(X_test_array)
binary_y_pred_diff = binary_pred_1 - binary_pred_2

# binary log loss and cross_entropy loss are the same given binary labels
np.testing.assert_almost_equal(binary_pred_1, binary_pred_2, decimal=4)
Эксперимент 2: кросс-энтропия может отличаться от binary log loss (не знаю почему):
######## data for experiment 2
def make_compressed_df(X, fixed_ratio=None):
    """Simulate compressed data in which instances sharing a feature are deduplicated.

    Each row of ``X`` stands for one distinct feature vector. It is given a
    random ``pos_count`` (1 or 2) and a ``neg_count``; the label becomes the
    mean of the underlying instance labels and the weight becomes the sum of
    the underlying (unit) instance weights.

    Example::

        original data:               compressed data:
        feature, label, weight       feature, label, pos_count, neg_count, weight
        x1,      1,     1
        x1,      1,     1      -->   x1,      2/3,   2,         1,         3
        x1,      0,     1
        ------------------------------------------------------------------------
        x2,      0,     1
        x2,      1,     1      -->   x2,      1/2,   1,         1,         2
        ------------------------------------------------------------------------
        x3,      1,     1
        x3,      1,     1      -->   x3,      2/2,   2,         0,         2

    Args:
        X: 2-D feature data accepted by ``pd.DataFrame``.
        fixed_ratio: int or None. If given, every row gets
            ``neg_count = int(fixed_ratio) * pos_count``, so neg_count/pos_count
            is constant across rows (key of the experiment!). ``0`` is honoured
            and yields rows without negatives. If None, ``neg_count`` is drawn
            independently (1 or 2).

    Returns:
        DataFrame holding the columns of ``X`` followed by ``pos_count``,
        ``neg_count``, ``total_count``, ``weight`` and ``label``.
    """
    # Defensive copy: the count columns added below must never end up on the
    # caller's object when X is already a DataFrame.
    compressed_df = pd.DataFrame(X).copy()
    n_rows = compressed_df.shape[0]

    compressed_df['pos_count'] = np.random.randint(1, 3, size=n_rows)
    # Compare against None explicitly so that a ratio of 0 is not silently
    # treated as "no ratio given".
    if fixed_ratio is not None:
        compressed_df['neg_count'] = int(fixed_ratio) * compressed_df['pos_count']
    else:
        compressed_df['neg_count'] = np.random.randint(1, 3, size=n_rows)

    total_count = compressed_df['pos_count'] + compressed_df['neg_count']
    compressed_df['total_count'] = total_count
    # Every underlying instance has weight 1, so the summed weight equals the count.
    compressed_df['weight'] = total_count
    compressed_df['label'] = compressed_df['pos_count'] / total_count
    return compressed_df
def restore_data(df):
    """Expand compressed rows back into one row per underlying instance.

    Every input row is repeated ``pos_count`` times with label 1 and
    ``neg_count`` times with label 0, and each restored row receives
    ``weight / total_count`` as its weight. All positive rows come first in the
    result, followed by all negative rows; the index labels of the source rows
    are kept.

    Example (shown grouped by feature for readability)::

        compressed data:                                original data:
        feature, label, pos_count, neg_count, weight    feature, label, weight
                                                        x1,      1,     1
        x1,      2/3,   2,         1,         3   -->   x1,      1,     1
                                                        x1,      0,     1
        ----------------------------------------------------------------------
                                                        x2,      0,     1
        x2,      1/2,   1,         1,         2   -->   x2,      1,     1
        ----------------------------------------------------------------------
                                                        x3,      1,     1
        x3,      2/2,   2,         0,         2   -->   x3,      1,     1

    Args:
        df: DataFrame with ``pos_count``, ``neg_count``, ``total_count`` and
            ``weight`` columns next to the feature columns. It is not modified.

    Returns:
        New DataFrame without the three count columns.
    """
    # Repeat by position rather than by index label: a label-based lookup
    # returns every row carrying that label, which over-counts as soon as the
    # index contains duplicates.
    row_positions = np.arange(len(df))
    pos_rows = np.repeat(row_positions, df['pos_count'].to_numpy())
    neg_rows = np.repeat(row_positions, df['neg_count'].to_numpy())

    # assign() builds new frames, so no label is written into a selection of df.
    pos_df = df.iloc[pos_rows].assign(label=1)
    neg_df = df.iloc[neg_rows].assign(label=0)

    restored = pd.concat([pos_df, neg_df], axis=0)
    restored['weight'] = restored['weight'] / restored['total_count']
    return restored.drop(columns=['pos_count', 'neg_count', 'total_count'])
def make_compressed_and_restored_data(X, fixed_ratio):
    """Build matching compressed and restored train/test frames from ``X``.

    The global NumPy seed is reset to 42 on every call so that the simulated
    counts are reproducible. The compressed frame is split 70/30 and each part
    is then expanded with ``restore_data``.

    Returns:
        ``((compressed_train, compressed_test), (restored_train, restored_test))``
    """
    np.random.seed(42)
    compressed = make_compressed_df(X, fixed_ratio)
    train_part, test_part = train_test_split(compressed, test_size=0.3, random_state=1)
    compressed_pair = (train_part, test_part)
    # Restore the train part first, then the test part.
    restored_pair = tuple(restore_data(part) for part in compressed_pair)
    return compressed_pair, restored_pair
# When the ratio of pos_count/neg_count is not fixed, the objectives differ.
(
    (compressed_train_random_ratio_df, compressed_test_df),
    (restored_train_random_ratio_df, restored_test_random_ratio_df),
) = make_compressed_and_restored_data(X, fixed_ratio=None)

# Binary objective, trained on the restored (one row per instance) data.
# The leading 30 columns are used as features -- presumably X has exactly
# 30 columns; confirm.
model1 = LGBMClassifier(objective='binary').set_params(**lgbm_params)
model1.fit(
    restored_train_random_ratio_df.iloc[:, :30],
    restored_train_random_ratio_df['label'],
    sample_weight=restored_train_random_ratio_df['weight'],
)

# Cross-entropy objective, trained on the compressed (soft label, summed weight) data.
model2 = LGBMRegressor(objective='cross_entropy').set_params(**lgbm_params)
model2.fit(
    compressed_train_random_ratio_df.iloc[:, :30],
    compressed_train_random_ratio_df['label'],
    sample_weight=compressed_train_random_ratio_df['weight'],
)

# Both models are scored on the same compressed test features.
y1 = model1.predict_proba(compressed_test_df.iloc[:, :30])[:, 1]
y2 = model2.predict(compressed_test_df.iloc[:, :30])

# this assertion fails
# NOTE(review): when everything is run as one script, the AssertionError raised
# here stops execution before any code that follows.
np.testing.assert_almost_equal(y1, y2, decimal=4)
# When the ratio of pos_count/neg_count is fixed, the objectives agree.
(
    (compressed_train_fixed_ratio_df, compressed_test_fixed_ratio_df),
    (restored_train_fixed_ratio_df, restored_test_fixed_ratio_df),
) = make_compressed_and_restored_data(X, fixed_ratio=2)

# Binary objective, trained on the restored (one row per instance) data.
# The leading 30 columns are used as features -- presumably X has exactly
# 30 columns; confirm.
model3 = LGBMClassifier(objective='binary').set_params(**lgbm_params)
model3.fit(
    restored_train_fixed_ratio_df.iloc[:, :30],
    restored_train_fixed_ratio_df['label'],
    sample_weight=restored_train_fixed_ratio_df['weight'],
)

# Cross-entropy objective, trained on the compressed (soft label, summed weight) data.
model4 = LGBMRegressor(objective='cross_entropy').set_params(**lgbm_params)
model4.fit(
    compressed_train_fixed_ratio_df.iloc[:, :30],
    compressed_train_fixed_ratio_df['label'],
    sample_weight=compressed_train_fixed_ratio_df['weight'],
)

# Both models are scored on the same compressed test features.
y3 = model3.predict_proba(compressed_test_fixed_ratio_df.iloc[:, :30])[:, 1]
y4 = model4.predict(compressed_test_fixed_ratio_df.iloc[:, :30])

# this assertion passes
np.testing.assert_almost_equal(y3, y4, decimal=4)