50. データの入手・整形
News Aggregator Data Setをダウンロードし、以下の要領で学習データ(train.txt),検証データ(valid.txt),評価データ(test.txt)を作成せよ.
1. ダウンロードしたzipファイルを解凍し,readme.txtの説明を読む.
2. 情報源(publisher)が”Reuters”, “Huffington Post”, “Businessweek”, “Contactmusic.com”, “Daily Mail”の事例(記事)のみを抽出する.
3. 抽出された事例をランダムに並び替える.
4. 抽出された事例の80%を学習データ,残りの10%ずつを検証データと評価データに分割し,それぞれtrain.txt,valid.txt,test.txtというファイル名で保存する.ファイルには,1行に1事例を書き出すこととし,カテゴリ名と記事見出しのタブ区切り形式とせよ(このファイルは後に問題70で再利用する).学習データと評価データを作成したら,各カテゴリの事例数を確認せよ.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
import random

# Publishers whose articles are kept.
TARGET_PUBLISHERS = ('Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail')
CATEGORIES = ('b', 't', 'e', 'm')

# Read the tab-separated corpus and keep the rows whose column 3 is one of
# the target publishers. Rows stay as raw field lists.
with open('./NewsAggregatorDataset/newsCorpora.csv', 'r') as f:
    rows = (line.split('\t') for line in f)
    news_lst = [row for row in rows if row[3] in TARGET_PUBLISHERS]

# Shuffle, then split 80% / 10% / remainder.
random.shuffle(news_lst)
train_len = int(len(news_lst) * 0.8)
valid_len = test_len = int(len(news_lst) * 0.1)
splits = (
    news_lst[:train_len],
    news_lst[train_len:train_len + valid_len],
    news_lst[train_len + valid_len:],
)

# Write one example per line as "<column 1>\t<column 4>".
# NOTE(review): the task statement asks for category first, but the lines are
# written as column 1 then column 4 (the counts below treat column 4 as the
# category). The later loading code reads the files in this same order, so
# the order must be changed in both places or not at all.
with open('./train.txt', 'w') as f1, open('./valid.txt', 'w') as f2, open('./test.txt', 'w') as f3:
    for handle, subset in zip((f1, f2, f3), splits):
        handle.writelines('\t'.join([row[1], row[4]]) + '\n' for row in subset)

# Check: print the number of examples per category for every split.
type_lst = ['train', 'valid', 'test']
for name, subset in zip(type_lst, splits):
    print(name, *(sum(1 for row in subset if row[4] == cat) for cat in CATEGORIES))
1 2 3 4 5 |
train 4483 1244 4226 731 valid 567 135 543 90 test 577 146 525 89 |
51. 特徴量抽出
学習データ,検証データ,評価データから特徴量を抽出し,それぞれtrain.feature.txt,valid.feature.txt,test.feature.txtというファイル名で保存せよ. なお,カテゴリ分類に有用そうな特徴量は各自で自由に設計せよ.記事の見出しを単語列に変換したものが最低限のベースラインとなるであろう.
1 2 3 4 5 6 7 8 9 10 11 |
import re
import spacy

# NOTE(review): whether the 'en' shortcut name resolves depends on the
# installed spaCy version / model links -- confirm in the target environment.
nlp = spacy.load('en')

# Parse the first tab-separated field of every training line with spaCy.
with open('./train.txt', 'r') as f:
    train_data = [nlp(line.split('\t')[0]) for line in f]
1 2 3 4 5 6 7 8 |
# Vocabulary setup
import collections

# Count lower-cased lemmas over all parsed training documents.
c = collections.Counter()
for doc in train_data:
    c.update(token.lemma_.lower() for token in doc)

# Keep lemmas seen more than 2 and fewer than 1000 times, most frequent first.
v = [lemma for lemma, freq in c.most_common() if freq > 2 and freq < 1000]
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 |
def _read_examples(handle):
    """Return ``[parsed first field, first character of second field]`` per line.

    Each line of *handle* is split on tabs; the first field is parsed with
    ``nlp`` and only the first character of the second field is kept, which
    drops the trailing newline.
    """
    examples = []
    for line in handle:
        fields = line.split('\t')
        examples.append([nlp(fields[0]), fields[1][0]])
    return examples


with open('./train.txt', 'r') as f1, open('./valid.txt', 'r') as f2, open('./test.txt', 'r') as f3:
    train_data = _read_examples(f1)
    valid_data = _read_examples(f2)
    test_data = _read_examples(f3)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
# Feature extraction
import pickle


def get_feature(sentence, vocab):
    """Return bag-of-words counts of *sentence* restricted to *vocab*.

    Args:
        sentence: iterable of tokens exposing a ``lemma_`` string attribute.
        vocab: iterable of lemma strings; its iteration order defines the key
            order of the returned dict.

    Returns:
        dict mapping every entry of *vocab* to the number of tokens in
        *sentence* whose lower-cased lemma equals it (0 when absent).
    """
    feature = {t: 0 for t in vocab}
    for w in sentence:
        lemma = w.lemma_.lower()
        # Test against the dict built from ``vocab`` (O(1) lookup) rather than
        # the module-level list ``v``, so the parameter is actually honoured.
        if lemma in feature:
            feature[lemma] += 1
    return feature


def write_features(data, file, vocab=None):
    """Pickle the feature records of *data* into the binary file object *file*.

    Args:
        data: iterable of ``[sentence, category]`` pairs.
        file: file object opened for binary writing.
        vocab: vocabulary passed to ``get_feature``; defaults to the
            module-level vocabulary ``v``.

    One list of ``{'words': <counts dict>, 'cat': <category>}`` records is
    written with ``pickle.dump``.
    """
    if vocab is None:
        vocab = v
    features = [{'words': get_feature(d[0], vocab), 'cat': d[1]} for d in data]
    pickle.dump(features, file)


# The output files carry a .txt suffix but contain binary pickle data, so
# they are opened in 'wb' mode.
with open('./train.feature.txt', 'wb') as f1, open('./valid.feature.txt', 'wb') as f2, open('./test.feature.txt', 'wb') as f3:
    write_features(train_data, f1)
    write_features(valid_data, f2)
    write_features(test_data, f3)
1 2 3 4 5 6 7 8 9 10 |
# Check
import pickle

# Reload the pickled training features, then show the first record and the
# number of records.
with open('./train.feature.txt', 'rb') as f1:
    data = pickle.load(f1)

print(data[0], len(data), sep='\n')
1 2 3 |
{'words': {'and': 0, 'us': 0, 'a': 0, 'with': 0, 'at': 0, 'after': 0, 'new': 0, 'say': 0, '"': 0, '$': 0, '(': 0, ')': 0, 'up': 0, 'not': 0, 'from': 0, 'stock': 0, 'by': 0, '?': 0, 'kardashian': 0, 'have': 0, 'rise': 0, ';': 0, 'will': 0, 'high': 0, 'kim': 0, 'china': 0, 'over': 0, 'euro': 1, '1': 0, 'show': 0, 'first': 0, 'low': 0, 'may': 0, 'more': 0, 'fall': 0, 'share': 0, 'deal': 0, 'that': 0, 'year': 0, 'about': 0, 'get': 0, '!': 0, 'dollar': 0, 'do': 0, 'star': 0, 'sale': 0, 'bank': 0, 'see': 0, 'make': 0, 'day': 0, 'out': 0, '2': 0, 'fed': 0, 'ecb': 0, 'profit': 0, 'take': 0, 'off': 0, 'buy': 0, 'billion': 0, 'global': 0, 'rate': 0, 'miley': 0, 'cyrus': 0, 'drop': 0, 'forex': 1, 'wall': 0, 'time': 0, 'datum': 0, 'ceo': 0, 'but': 0, 'report': 0, 'set': 0, 'west': 0, 'could': 0, 'cut': 0, 'hit': 0, 'gain': 0, 'growth': 0, 'pay': 0, 'reveal': 0, 'video': 0, 'chris': 0, 'price': 0, 'ukraine': 0, 'big': 0, 'market': 0, |
52. 学習
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
from sklearn.linear_model import LogisticRegression
import pickle

with open('./train.feature.txt', 'rb') as f1:
    data = pickle.load(f1)

# Turn every record into a row of counts (in dict order) plus its label.
# The loop variable ``feature`` is kept at module level on purpose: the
# weight-inspection code further down reads it.
train_x, train_t = [], []
for feature in data:
    train_x.append(list(feature['words'].values()))
    train_t.append(feature['cat'])

# Fit a logistic-regression classifier with default settings.
lr = LogisticRegression()
lr.fit(train_x, train_t)
53. 予測
1 2 3 4 5 6 7 8 9 10 11 |
with open('./test.feature.txt', 'rb') as f1:
    data = pickle.load(f1)

# Build the test matrix and labels the same way as for training.
# ``feature`` stays a module-level loop variable; later code reads it.
test_x, test_t = [], []
for feature in data:
    test_x.append(list(feature['words'].values()))
    test_t.append(feature['cat'])

# Predict a category for every test example.
out = lr.predict(test_x)
1 2 3 4 5 |
# Predict on the test set and print "<predicted> <gold>" for at most the
# first 50 examples.
out = lr.predict(test_x)
for predicted, gold in zip(out[:50], test_t):
    print(predicted, gold)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
b b e e e e b b e e b b b b b b b b b b b b e e e e b b b b b b m m b b b t b b e e b b e e b b e e b b e e b b b b b b e e e e b b e e m m m b t t t t e e b b e e e e e e b b t t m m e e e m e e b b |
54. 正解率の計測
1 2 3 4 5 6 7 8 9 |
def _accuracy(predicted, gold):
    """Return the fraction of positions where *predicted* equals *gold*."""
    return sum(1 for p, g in zip(predicted, gold) if p == g) / len(predicted)


# Training data
out = lr.predict(train_x)
print(_accuracy(out, train_t))

# Test data
out = lr.predict(test_x)
print(_accuracy(out, test_t))
1 2 3 4 |
0.9747285660801198 0.9050112191473448 |
55. 混同行列の作成
52で学習したロジスティック回帰モデルの混同行列(confusion matrix)を,学習データおよび評価データ上で作成せよ.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 |
from sklearn.metrics import confusion_matrix
import seaborn as sns

with open('./valid.feature.txt', 'rb') as f1:
    data = pickle.load(f1)

# Build the validation matrix and labels. ``feature`` stays a module-level
# loop variable because later code reads it.
valid_x, valid_t = [], []
for feature in data:
    valid_x.append(list(feature['words'].values()))
    valid_t.append(feature['cat'])

# Confusion matrix on the training data, then on the validation data.
# NOTE(review): the task statement asks for the evaluation (test) split, but
# this code loads and scores valid.feature.txt -- confirm which is intended.
for gold, inputs in ((train_t, train_x), (valid_t, valid_x)):
    cm = confusion_matrix(gold, lr.predict(inputs), labels=['b', 't', 'e', 'm'])
    print(cm)

# Heat map of the last matrix computed (validation data).
sns.heatmap(cm)
1 2 3 4 5 6 7 8 9 10 |
[[4413 41 26 3] [ 85 1122 36 1] [ 23 1 4200 2] [ 29 3 20 679]] [[542 8 16 1] [ 26 93 15 1] [ 10 2 528 3] [ 11 4 22 53]] |
56. 適合率,再現率,F1スコアの計測
1 2 3 4 5 6 7 8 |
# Automatic: let scikit-learn compute precision / recall / F1 per category.
from sklearn.metrics import classification_report

valid_pred = lr.predict(valid_x)
report = classification_report(valid_t, valid_pred, labels=['b', 't', 'e', 'm'])
print(report)
1 2 3 4 5 6 7 8 9 10 |
precision recall f1-score support b 0.92 0.96 0.94 567 t 0.87 0.69 0.77 135 e 0.91 0.97 0.94 543 m 0.91 0.59 0.72 90 avg / total 0.91 0.91 0.91 1335 |
多クラス分類におけるTrue Positiveとかの式は以下のようになります。
True Positive: \(tp_i = c_{ii}\)
False Positive: \(fp_i = \sum_{n=1}^{l}c_{ni} - tp_{i}\)
False Negative: \(fn_i = \sum_{n=1}^{l}c_{in} - tp_{i}\)
True Negative: \(tn_i = \sum_{n=1}^{l}\sum_{k=1}^{l}c_{nk} - tp_{i} - fp_{i} - fn_{i}\)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
import numpy as np


def calc_report(metrics, labels):
    """Compute per-class and averaged precision / recall / F1 from a confusion matrix.

    Args:
        metrics: square confusion matrix indexed as ``metrics[i, j]``; row
            sums are used for false negatives and column sums for false
            positives.
        labels: class labels; only ``len(labels)`` is used, as the number of
            classes.

    Returns:
        list with one dict per class (keys ``tp``, ``fp``, ``fn``, ``tn``,
        ``precision``, ``recall``, ``f1-measure``), followed by one dict of
        micro-averaged scores and one dict of macro-averaged scores. Scores
        are rounded to 2 decimals.
    """
    # Every count is derived from the argument; the previous version read the
    # module-level ``cm`` for fp/fn/tn, silently ignoring ``metrics``.
    metrics = np.asarray(metrics)
    total = metrics.sum()
    length = len(labels)

    index = []
    for i in range(length):
        tp = metrics[i, i]
        fp = metrics[:, i].sum() - tp
        fn = metrics[i, :].sum() - tp
        tn = total - tp - fp - fn
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1_measure = 2 * precision * recall / (precision + recall)
        index.append({
            'tp': tp,
            'fp': fp,
            'fn': fn,
            'tn': tn,
            'precision': round(precision, 2),
            'recall': round(recall, 2),
            'f1-measure': round(f1_measure, 2)
        })

    # Micro average: pool the counts of all classes before dividing.
    tp_total = sum([row['tp'] for row in index])
    micro_precision = tp_total / sum([row['tp'] + row['fp'] for row in index])
    micro_recall = tp_total / sum([row['tp'] + row['fn'] for row in index])
    micro_f1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall)

    # Macro average: mean of the unrounded per-class scores. Macro F1 is the
    # harmonic mean of macro precision and macro recall, not the mean of the
    # per-class F1 values.
    macro_precision = sum([row['tp'] / (row['tp'] + row['fp']) for row in index]) / length
    macro_recall = sum([row['tp'] / (row['tp'] + row['fn']) for row in index]) / length
    macro_f1 = 2 * macro_precision * macro_recall / (macro_precision + macro_recall)

    index.append({'micro-p': round(micro_precision, 2), 'micro_recall': round(micro_recall, 2), 'micro-f1': round(micro_f1, 2)})
    index.append({'macro-p': round(macro_precision, 2), 'macro_recall': round(macro_recall, 2), 'macro-f1': round(macro_f1, 2)})
    return index


labels = ['b', 't', 'e', 'm']
cm = confusion_matrix(valid_t, lr.predict(valid_x), labels=labels)
for i in calc_report(cm, labels):
    print(i)
1 2 3 4 5 6 7 8 |
{'tp': 542, 'fp': 47, 'fn': 25, 'tn': 721, 'precision': 0.92, 'recall': 0.96, 'f1-measure': 0.94} {'tp': 93, 'fp': 14, 'fn': 42, 'tn': 1186, 'precision': 0.87, 'recall': 0.69, 'f1-measure': 0.77} {'tp': 528, 'fp': 53, 'fn': 15, 'tn': 739, 'precision': 0.91, 'recall': 0.97, 'f1-measure': 0.94} {'tp': 53, 'fp': 5, 'fn': 37, 'tn': 1240, 'precision': 0.91, 'recall': 0.59, 'f1-measure': 0.72} {'micro-p': 0.91, 'micro_recall': 0.91, 'micro-f1': 0.91} {'macro-p': 0.9, 'macro_recall': 0.8, 'macro-f1': 0.85} |
57. 特徴量の重みの確認
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
import pandas as pd

# Feature names in column order, taken from the key order of the last
# feature record iterated at module level (``feature``).
features_list = [k for k, v in feature['words'].items()]

# Rows of ``lr.coef_`` follow ``lr.classes_`` (sorted labels), not the display
# order in ``labels``. Look each label's row up explicitly so every table is
# printed under the class it belongs to.
class_row = {cls: row for row, cls in enumerate(lr.classes_)}

for label in labels:
    print('-----' + label + '-----')
    weights = lr.coef_[class_row[label]]
    ascending = weights.argsort()
    # First table: the 10 lowest weights; second table: the 10 highest.
    for indices in (ascending[:10], ascending[::-1][:10]):
        # ``display`` is the notebook display hook (not imported here).
        display(pd.DataFrame(
            [
                [features_list[j] for j in indices],
                weights[indices],
            ],
            index=['特徴量', '重み'],
            columns=[str(rank) + '位' for rank in range(1, len(indices) + 1)],
        ))
58. 正則化パラメータの変更
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
from sklearn.linear_model import LogisticRegression
import pickle
from tqdm import tqdm


def getdata(file):
    """Load pickled feature records from *file* and return ``(x, t)``.

    ``x`` holds one list of count values per record (in dict order) and ``t``
    holds the matching category labels.
    """
    records = pickle.load(file)
    x = [list(record['words'].values()) for record in records]
    t = [record['cat'] for record in records]
    return x, t


# Load the three splits.
with open('./train.feature.txt', 'rb') as f1, open('./valid.feature.txt', 'rb') as f2, open('./test.feature.txt', 'rb') as f3:
    train_x, train_t = getdata(f1)
    valid_x, valid_t = getdata(f2)
    test_x, test_t = getdata(f3)

train_acc, valid_acc, test_acc = [], [], []
x_axis = np.arange(0.1, 5.1, 0.1)
evaluation_sets = (
    (train_acc, train_x, train_t),
    (valid_acc, valid_x, valid_t),
    (test_acc, test_x, test_t),
)

for C in tqdm(x_axis):
    # Train with L2 regularisation at the current C.
    lr = LogisticRegression(penalty='l2', C=C)
    lr.fit(train_x, train_t)

    # Record the accuracy on each split.
    for history, inputs, gold in evaluation_sets:
        out = lr.predict(inputs)
        history.append(sum(1 for p, g in zip(out, gold) if p == g) / len(out))

for history in (train_acc, valid_acc, test_acc):
    print(history, len(history))
1 2 3 4 5 6 7 8 9 |
import matplotlib.pyplot as plt

# One accuracy curve per split, plotted against the regularisation parameter C.
for series, name in ((train_acc, 'Train'), (valid_acc, 'Valid'), (test_acc, 'Test')):
    plt.plot(x_axis, series, label=name)
plt.legend()
plt.show()
59. ハイパーパラメータの探索
1 2 3 4 5 6 7 8 9 10 11 12 13 14 |
# Load the training and validation features.
with open('./train.feature.txt', 'rb') as f1, open('./valid.feature.txt', 'rb') as f2:
    train_x, train_t = getdata(f1)
    valid_x, valid_t = getdata(f2)

# Train with L2 regularisation and C=2.4.
lr = LogisticRegression(penalty='l2', C=2.4)
lr.fit(train_x, train_t)

# Validation accuracy.
out = lr.predict(valid_x)
hits = sum(1 for p, g in zip(out, valid_t) if p == g)
print(hits / len(out))
1 2 3 |
0.9168539325842696 |