u++の備忘録

言語処理100本ノック 2020「59. ハイパーパラメータの探索」

問題文

nlp100.github.io

問題の概要

ロジスティック回帰「LogisticRegression()」の「C」に加えて、学習アルゴリズムとして「RandomForestClassifier()」も利用し「max_depth」の値を調整します。

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Feature tables for the train / valid / test splits, read as header-less
# tab-separated files (every row is data, columns are numbered from 0).
X_train, X_valid, X_test = (
    pd.read_table(feature_path, header=None)
    for feature_path in (
        'ch06/train.feature.txt',
        'ch06/valid.feature.txt',
        'ch06/test.feature.txt',
    )
)

# Targets for the same splits: column 1 of each header-less split file.
y_train, y_valid, y_test = (
    pd.read_table(split_path, header=None)[1]
    for split_path in (
        'ch06/train.txt',
        'ch06/valid.txt',
        'ch06/test.txt',
    )
)

# Hyperparameter search: fit every candidate on the training split, score it
# on the *validation* split, and keep the best one. The test split is only
# touched once, for the selected model, so it is never used for selection.
#
# Each entry is (algorithm name, parameter description, validation accuracy,
# fitted classifier). Carrying the labels with the score avoids decoding a
# flat index with a hard-coded offset into the candidate lists.
results = []

C_candidate = [0.1, 1.0, 10, 100]
for c in C_candidate:
    clf = LogisticRegression(penalty='l2', solver='sag', random_state=0, C=c)
    clf.fit(X_train, y_train)
    valid_acc = accuracy_score(y_valid, clf.predict(X_valid))
    results.append(('LogisticRegression', f'C={c}', valid_acc, clf))


max_depth_candidate = [2, 4, 8, 16]
for m in max_depth_candidate:
    clf = RandomForestClassifier(max_depth=m, random_state=0)
    clf.fit(X_train, y_train)
    valid_acc = accuracy_score(y_valid, clf.predict(X_valid))
    results.append(('RandomForestClassifier', f'max_depth={m}', valid_acc, clf))

# max() returns the first maximal entry, so ties are resolved in favour of
# the earlier candidate (same tie-breaking as list.index(max(...))).
bestAlg, bestParam, best_valid_acc, best_clf = max(results, key=lambda r: r[2])

print(bestAlg, bestParam)
# Accuracy of the selected model on the held-out test split.
print('test accuracy:', accuracy_score(y_test, best_clf.predict(X_test)))