u++の備忘録

言語処理100本ノック 2020「80. ID番号への変換」

python 自然言語処理

問題文

nlp100.github.io

問題の概要

指示通りに愚直に実装します。

from collections import defaultdict

import joblib
import pandas as pd


def text2id(text, vocab=None):
    """Convert a whitespace-separated string into a list of word IDs.

    Args:
        text: The string to convert; it is tokenized with ``str.split()``.
        vocab: Optional mapping from word to integer ID. When omitted, the
            module-level ``word2token`` mapping is used.

    Returns:
        A list with one integer ID per token, in token order. Tokens that
        are missing from the mapping get ID 0, the same ID the vocabulary
        assigns to words seen fewer than twice in the training data.
    """
    if vocab is None:
        # Resolved at call time, so the mapping may be defined after this
        # function in the file.
        vocab = word2token
    # Use .get so that unseen words (e.g. in validation/test text) map to 0
    # instead of raising KeyError.
    return [vocab.get(word, 0) for word in text.split()]


# Read the training split without treating any row as a header, then name
# the two columns explicitly.
X_train = pd.read_table('ch06/train.txt', header=None)
use_cols = ['TITLE', 'CATEGORY']
X_train.columns = use_cols

# Count how often each whitespace-separated token occurs across all titles.
d = defaultdict(int)
for word in (w for title in X_train['TITLE'] for w in title.split()):
    d[word] += 1

# Order (word, count) pairs by descending count. The sort is stable, so
# equally frequent words keep the order in which they were first seen.
dc = list(d.items())
dc.sort(key=lambda pair: pair[1], reverse=True)

# A word's ID is its 1-based frequency rank; every word seen fewer than
# twice shares ID 0.
words = [word for word, _ in dc]
idx = [rank if count >= 2 else 0 for rank, (_, count) in enumerate(dc, 1)]

word2token = dict(zip(words, idx))
print(X_train['TITLE'].apply(text2id))