forked from brainome/automl-benchmarks
-
Notifications
You must be signed in to change notification settings - Fork 0
/
open_ml_brainome_wrapper.py
135 lines (114 loc) · 4.57 KB
/
open_ml_brainome_wrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# Brainome Daimensions(tm)
#
# The Brainome Table Compiler(tm)
# Copyright (c) 2022 Brainome Incorporated. All Rights Reserved.
# GPLv3 license, all text above must be included in any redistribution.
# See LICENSE.TXT for more information.
#
# This program may use Brainome's servers for cloud computing. Server use
# is subject to separate license agreement.
#
# Contact: [email protected]
# for questions and suggestions.
#
# @author: [email protected]
# @author: [email protected]
import logging
import os
import shutil
import csv
import json
import importlib.util
import sys
import argparse
import time
from open_ml_experiment import get_target, get_valid_test_id
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
MODEL_TYPES = ['RF', 'NN', 'DT', 'SVM']
SPLITS_DIR = 'TRAIN_TEST_SPLITS'
BTC_TIMES = {}
def prepare_run_directory(model_type):
if os.path.exists('target'):
shutil.rmtree('target')
predictor_dir = f"btc-runs/{model_type}"
if os.path.exists(predictor_dir):
shutil.rmtree(predictor_dir)
os.makedirs(predictor_dir)
def split(test_params, data_dir):
source_file = test_params.get('TRAIN_FILE')
csv_name = source_file.split(os.sep)[-1].replace('.csv', '')
train_data = f"{SPLITS_DIR}/{csv_name}-clean-train.csv"
test_data = f"{SPLITS_DIR}/{csv_name}-clean-test-targetless.csv"
test_data_with_labels = f"{SPLITS_DIR}/{csv_name}-clean-test.csv"
if not all([os.path.exists(file) for file in [train_data, test_data, test_data_with_labels]]):
target = get_target(test_params, data_dir)
cmd = f'python3 helpers/split_data.py {data_dir}/{source_file} {SPLITS_DIR} -target {target}'
print(f'splitting: {cmd}')
os.system(cmd)
else:
print(f'{source_file} data already split')
def make_predictor(test_params, model_type, data_dir, multiclass=False):
test_id = test_params.get('TEST_ID').replace('-', '_')
csv_name = test_params.get('TRAIN_FILE').split(os.sep)[-1].replace('.csv', '')
train_file = f"TRAIN_TEST_SPLITS/{csv_name}-clean-train.csv"
cmd = f"brainome {train_file} -f {model_type} -y -split 70 -modelonly -q -o btc-runs/{model_type}/{test_id}.py -json btc-runs/{model_type}/{test_id}.json"
if model_type == 'DT':
cmd += ' -rank'
if model_type == 'RF' and multiclass:
cmd += ' -e 5'
start = time.time()
print(f'making predictor: {cmd}')
os.system(cmd)
end = time.time()
train_time = end - start
raw_test_id = test_params.get('TEST_ID')
if raw_test_id not in BTC_TIMES:
BTC_TIMES[raw_test_id] = {}
BTC_TIMES[raw_test_id][model_type] = train_time
return train_time
def run_predictor(test_params, model_type):
test_id = test_params.get('TEST_ID').replace('-', '_')
csv_name = test_params.get('TRAIN_FILE').split(os.sep)[-1].replace('.csv', '')
path_to_data = f'TRAIN_TEST_SPLITS/{csv_name}-clean-test.csv'
cmd = f'python3 helpers/run_brainome_predictor.py {model_type} {test_id} {csv_name}'
json_output = json.loads(os.popen(cmd).read().strip())
print(f'result from running predictor: {json_output}')
return json_output
def update_model_results(test_params, model_type, new_results):
test_id = test_params.get('TEST_ID').replace('-', '_')
path_to_json = f'btc-runs/{model_type}/{test_id}.json'
old_json = json.load(open(path_to_json))
new_json = {**old_json, **new_results}
with open(path_to_json, 'w+') as f:
print(json.dumps(new_json), file=f)
def main(suite, data_dir):
assert os.path.exists(suite)
try:
for model_type in MODEL_TYPES:
print(f'Working on {model_type}.')
prepare_run_directory(model_type)
with open(suite) as f:
reader = csv.DictReader(f, delimiter='\t')
for idx, test_params in enumerate(reader):
try:
if test_params.get('IGNORE'):
continue
split(test_params, data_dir)
train_time = make_predictor(test_params, model_type, data_dir, multiclass=('multiclass' in suite))
new_results = run_predictor(test_params, model_type)
# add test information to json output
update_model_results(test_params, model_type, new_results)
except ValueError as my_error:
logger.error(f"test scenario fail {test_params['TEST_ID']}", exc_info=my_error)
finally:
with open('btc-runs/btc-times.json', 'w+') as json_out:
print(json.dumps(BTC_TIMES), file=json_out)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('suite', type=str, nargs='?', default="test-suites/open_ml_select.tsv", help="tsv file in test-suites")
parser.add_argument('data_dir', type=str, nargs='?', default="./data", help="data/ directory")
args = parser.parse_args()
if not os.path.exists(SPLITS_DIR):
os.mkdir(SPLITS_DIR)
main(args.suite, args.data_dir)