https://pastein.ru/t/eA
скопируйте уникальную ссылку для отправки
Загрузка данных
from pathlib import Path
import pandas as pd
from pmlb import classification_dataset_names, fetch_data, regression_dataset_names
from pmlb.write_metadata import imbalance_metrics
from benchmark.benchmark_model_types import BenchmarkModelTypesEnum
from benchmark.benchmark_utils import \
(convert_json_stats_to_csv, get_models_hyperparameters,
get_penn_case_data_paths, save_metrics_result_file)
from benchmark.executor import CaseExecutor, ExecutionParams
from core.repository.tasks import TaskTypesEnum, Task
from core.models.data import InputData
from core.models.model import Model
from sklearn.metrics import mean_squared_error as mse, roc_auc_score as roc_auc, r2_score
from datetime import timedelta
def _problem_and_metric_for_dataset(name_of_dataset: str, num_classes: int):
    """Choose the task type and the metric names for a PMLB dataset.

    A dataset is treated as classification only when it has exactly two
    classes and is listed in ``classification_dataset_names``. Otherwise it
    is treated as regression when it is listed in
    ``regression_dataset_names``.

    :param name_of_dataset: dataset name to look up in the PMLB name lists
    :param num_classes: number of classes found in the dataset target
    :return: ``(task_type, metric_names)``, or ``(None, None)`` when the
        dataset matches neither case
    """
    is_binary_classification = (
        num_classes == 2 and name_of_dataset in classification_dataset_names
    )
    if is_binary_classification:
        return TaskTypesEnum.classification, ['roc_auc', 'f1']

    if name_of_dataset in regression_dataset_names:
        return TaskTypesEnum.regression, ['mse', 'r2']

    # Multi-class classification and unknown names are not supported here.
    return None, None
def is_dataset_big(name_of_dataset):
    """Tell whether a PMLB dataset counts as "big".

    The dataset is loaded with ``fetch_data`` and is considered big when it
    has more than 10000 rows and its ``size`` is greater than 200000.

    :param name_of_dataset: dataset name passed to ``fetch_data``
    :return: ``True`` when both thresholds are exceeded, ``False`` otherwise
    """
    # NOTE(review): assumes ``fetch_data`` returns a pandas DataFrame, where
    # ``size`` is rows * columns - confirm against the pmlb version in use.
    dataset = fetch_data(name_of_dataset)
    # Always return a real bool; falling off the end would yield None for
    # small datasets, which is falsy but not a proper predicate result.
    return bool(len(dataset.index) > 10000 and dataset.size > 200000)
def pmlb_dataset(name_of_dataset, task):
    """Load the train and test parts of a PMLB dataset as ``InputData``.

    The CSV locations come from ``get_penn_case_data_paths``; each file is
    read with ``InputData.from_csv`` using a fresh ``Task(task)``.

    :param name_of_dataset: dataset name used to resolve the CSV paths
    :param task: task type forwarded to ``Task``
    :return: ``(train_data, test_data)``
    """
    train_csv, test_csv = get_penn_case_data_paths(name_of_dataset)
    return (
        InputData.from_csv(train_csv, task=Task(task)),
        InputData.from_csv(test_csv, task=Task(task)),
    )
if __name__ == '__main__':
    # For every dataset named in ./datasets.csv (column 'dataset_names'):
    # fit an 'xgbreg' model, score it with R^2 on the test part, repeat with
    # a fine-tuned model, and save both scores under 'before' / 'after'.
    penn_data = Path('./datasets.csv')
    dataset = []
    if penn_data.is_file():
        dataset = pd.read_csv(penn_data)['dataset_names'].values

    # Use len(): after loading, ``dataset`` is a NumPy array, whose truth
    # value is ambiguous. The hint covers both a missing file and a file
    # that lists no datasets.
    if len(dataset) == 0:
        print('Please create nonempty csv-file with datasets')

    for name_of_dataset in dataset:
        pmlb_data = fetch_data(name_of_dataset)
        num_classes, _ = imbalance_metrics(pmlb_data['target'].tolist())
        problem_class, metric_names = _problem_and_metric_for_dataset(name_of_dataset, num_classes)
        if not problem_class or not metric_names:
            print(f'Incorrect dataset: {name_of_dataset}')
            continue

        train_data, test_data = pmlb_dataset(name_of_dataset, problem_class)

        # NOTE(review): datasets resolved as classification are still fitted
        # with 'xgbreg' and scored with R^2 below - confirm this is intended.
        r2_scores = {}
        try:
            # Baseline: model fitted without tuning.
            xgb = Model(model_type='xgbreg')
            model, _ = xgb.fit(data=train_data)
            test_predicted = xgb.predict(fitted_model=model, data=test_data)
            r2_scores['before'] = r2_score(y_true=test_data.target,
                                           y_pred=test_predicted)

            # Same model type after fine-tuning (10 iterations, 1 minute cap).
            xgb_for_tune = Model(model_type='xgbreg')
            tuned_model, _ = xgb_for_tune.fine_tune(data=train_data, iterations=10,
                                                    max_lead_time=timedelta(minutes=1))
            test_predicted_tuned = xgb_for_tune.predict(fitted_model=tuned_model,
                                                        data=test_data)
            r2_scores['after'] = r2_score(y_true=test_data.target,
                                          y_pred=test_predicted_tuned)
        except Exception as ex:
            # Best-effort benchmark loop: report the failure and move on to
            # the next dataset instead of aborting the whole run.
            print(f'Exception on {name_of_dataset}: {ex}')
            continue

        save_metrics_result_file(r2_scores, file_name=f'penn_ml_metrics_for_{name_of_dataset}')

    # NOTE: merging the per-dataset result files into one CSV
    # (convert_json_stats_to_csv) is currently disabled.
# if __name__ == '__main__':
# penn_data = Path('./datasets.csv')
# dataset = []
# if penn_data.is_file():
# df = pd.read_csv(penn_data)
# dataset = df['dataset_names'].values
# else:
# print('Please create nonempty csv-file with datasets')
#
# if len(dataset) == 0:
# dataset = classification_dataset_names + regression_dataset_names
#
# for name_of_dataset in dataset:
# pmlb_data = fetch_data(name_of_dataset)
# num_classes, _ = imbalance_metrics(pmlb_data['target'].tolist())
# problem_class, metric_names = _problem_and_metric_for_dataset(name_of_dataset, num_classes)
# if not problem_class or not metric_names:
# print(f'Incorrect dataset: {name_of_dataset}')
# continue
#
# train_file, test_file = get_penn_case_data_paths(name_of_dataset)
# config_models_data = get_models_hyperparameters()
# case_name = f'penn_ml_{name_of_dataset}'
#
# try:
# result_metrics = CaseExecutor(params=ExecutionParams(train_file=train_file,
# test_file=test_file,
# task=problem_class,
# target_name='target',
# case_label=case_name),
# # models=[BenchmarkModelTypesEnum.tpot,
# # BenchmarkModelTypesEnum.baseline,
# # BenchmarkModelTypesEnum.fedot],
# models=[BenchmarkModelTypesEnum.fedot],
# metric_list=metric_names).execute()
# except Exception as ex:
# print(f'Exception on {name_of_dataset}: {ex}')
# continue
#
# result_metrics['hyperparameters'] = config_models_data
#
# save_metrics_result_file(result_metrics, file_name=f'penn_ml_metrics_for_{name_of_dataset}')
#
# convert_json_stats_to_csv(dataset)