技術/廣告 文章分類器(二)

PeasantWorker 2022-01-08 01:10:13 阅读数:300

文章


前言

本文基於上一篇博客技術/廣告 文章分類器(一),作出了一些優化,將准確率由84.5%提昇至94.4%


一、優化手段

1、增加訓練數據

之前的訓練數據集,兩類數據分別只有500條左右,訓練數據太少。
本文所使用數據集為45000餘條,增加了90倍,應該完全夠用

2、更改分類模型

之前使用多項式樸素貝葉斯,效果一般,由於使用了樣本屬性獨立性的假設,且樣本是文本,樣本屬性是分詞後各個單詞對應的詞向量,文本間是有上下文關系的,所以,樣本屬性是不獨立的,樣本屬性有關聯,樣本屬性獨立性假設不成立,導致最終效果不理想。因此,改用集成學習模型,且達到了一個較好的效果

3、分詞時加入用戶詞典

一些關鍵的詞,並沒有被理想地切分出來,加入用戶詞典後,與不加入相比,准確率提高了1%左右

4、去除停用詞及特殊符號

在分詞之前,去除了表情及一些特殊符號,嘗試過在分詞之後再去除特殊符號,結果證明在分詞之前去除特殊符號,效果更好,去除特殊符號後,准確率提昇2%左右

二、TFIDF + AdaBoost

全部代碼

class TrainBlogClsTfidfAdaBoost:
    """Technology-vs-advertisement blog classifier: TF-IDF features + AdaBoost.

    Artifact paths (jieba user dictionary, training data directory,
    vectorizer and model pickles) come from project path helpers.
    `load()` lazily trains when no persisted model exists yet.
    """

    def __init__(self):
        # A custom user dictionary improves jieba's segmentation of domain terms.
        jieba.load_userdict(get_blog_cls_jieba_user_dict_path())
        self.train_data_dir = get_blog_cls_train_data_optimize_dir()
        self.tfidf_path = get_tfidf_path()
        self.model_path = get_adaboost_model_path()
        # Dev-environment paths, kept for quick local experiments:
        # self.train_data_dir = get_blog_cls_train_data_dev_dir()
        # self.tfidf_path = get_test_tfidf_path()
        # self.model_path = get_adaboost_test_model_path()

    def load(self):
        """Load the persisted model and vectorizer, training first if absent."""
        if not os.path.exists(self.model_path):
            # BUG FIX: the original passed self.model_path as a %-style lazy
            # argument with no placeholder in the message, so the path never
            # appeared in the log output.
            logger.warning("開始訓練,目標模型數據:%s", self.model_path)
            self.train()
        logger.info("加載模型")
        self.model = joblib.load(self.model_path)
        self.tf_idf = joblib.load(self.tfidf_path)

    def load_data(self):
        """加載文件內容和標簽 — load, clean and segment every .txt file, then
        return an 80/20 train/test split (X_train, X_test, y_train, y_test)."""
        files = get_files_path(self.train_data_dir, '.txt')
        contents = []
        labels = []
        for file in files:
            with open(file, 'r') as f:
                data = f.read()
            # Strip emoji/special symbols BEFORE segmentation (measured +2% acc).
            data = filter_content_for_blog_cls(data)
            contents.append(' '.join(jieba.cut(data)))
            # The parent directory name encodes the class label.
            labels.append(file.split('/')[-2])
        return train_test_split(contents,
                                labels,
                                test_size=0.2,
                                random_state=123456)

    def load_stopwords(self):
        """Return the Chinese stopword list, one word per line."""
        path = './data/pro/datasets/stopwords/cn_stopwords.txt'
        with open(path, 'r') as f:
            return f.read().split('\n')

    def train(self):
        """Fit the TF-IDF vectorizer and AdaBoost model; persist both."""
        logger.info('開始訓練...')
        stopwords = self.load_stopwords()
        X_train, X_test, y_train, y_test = self.load_data()
        tfidf = TfidfVectorizer(stop_words=stopwords, max_df=0.5)
        # BUG FIX: the original called fit(X_train) and then transform(X_train),
        # analyzing the training corpus twice; fit_transform is one pass.
        train_data = tfidf.fit_transform(X_train)
        test_data = tfidf.transform(X_test)
        joblib.dump(tfidf, self.tfidf_path, compress=1)
        model = AdaBoostClassifier()
        model.fit(train_data, y_train)
        joblib.dump(model, self.model_path, compress=1)
        predict_test = model.predict(test_data)
        # accuracy_score(y_true, y_pred): keep the documented argument order
        # (the metric is symmetric, but the convention aids readability).
        print("准確率為:", metrics.accuracy_score(y_test, predict_test))

    def predict(self, test_data):
        """Classify one raw article string.

        Returns the model's prediction array (single label) for the input.
        """
        test_data = filter_content_for_blog_cls(test_data)
        test_data = ' '.join(jieba.cut(test_data))
        test_vec = self.tf_idf.transform([test_data])
        return self.model.predict(test_vec)

    def test_acc(self):
        """Report accuracy on the held-out CSV test set (content,label columns)."""
        data_path = './data/pro/datasets/blogs/blog_adver_cls/test_dev.csv'
        data = pd.read_csv(data_path).dropna(axis=0)
        text_list = []
        for text in data['content']:
            text = filter_content_for_blog_cls(text)
            text_list.append(' '.join(jieba.cut(text)))
        test_data = self.tf_idf.transform(text_list)
        predict_test = self.model.predict(test_data)
        print("在測試集准確率為:", metrics.accuracy_score(data['label'], predict_test))

結果:

在測試集准確率為: 0.9646315789473684

測試數據大概5000條,這個數量級,還是比較有說服力的

三、Fasttext

之前有用過fasttext來做圖書分類,見「fasttext文本分類」,在三分類上准確率達到93%,在35個類別上准確率為75.6%,總體效果還不錯,於是想到用fasttext來試下,看看效果是否會更好些。

全部代碼

import os
import fasttext
import jieba
import logging
import random
from tqdm import tqdm
import pandas as pd
from sklearn import metrics
from common.utils import get_files_path
from common.utils import filter_content_for_blog_cls
from common.path.dataset.blog import get_blog_cls_jieba_user_dict_path
from common.path.dataset.blog import get_blog_cls_train_data_dev_dir, get_fasttext_train_data_path
from common.path.model.blog import get_blog_cls_fasttext_model_path
logger = logging.getLogger(__name__)
class TrainBlogClsFasttext:
    """Technology-vs-advertisement blog classifier backed by fasttext.

    `load()` lazily trains a supervised fasttext model from the dev data
    directory when no persisted model exists yet.
    """

    def __init__(self):
        jieba.load_userdict(get_blog_cls_jieba_user_dict_path())
        self.train_data_dev_dir = get_blog_cls_train_data_dev_dir()
        self.train_data_path = get_fasttext_train_data_path()
        self.fasttext_model_path = get_blog_cls_fasttext_model_path()
        # Map fasttext's internal label tokens back to human-readable names.
        self.class_name_mapping = {
            '__label__0': 'technology',
            '__label__1': 'advertisement',
        }

    def load(self):
        """Load the persisted fasttext model, training first if absent."""
        if not os.path.exists(self.fasttext_model_path):
            logger.info('開始訓練模型...')
            self.train_fasttext()
        logger.info("加載模型")
        self.model = fasttext.load_model(self.fasttext_model_path)

    def data_process(self):
        """Build fasttext train/test files (80/20 split) from labeled .txt blogs.

        Each output line is "<segmented text>\\t__label__<id>\\n"; the label id
        is derived from 'technology'/'advertisement' in the file path.
        No-op when both output files already exist.
        """
        files = get_files_path(self.train_data_dev_dir, '.txt')
        if not os.path.exists(self.train_data_path):
            os.mkdir(self.train_data_path)
        random.shuffle(files)
        fasttext_train_data_path = os.path.join(self.train_data_path, 'train.txt')
        fasttext_test_data_path = os.path.join(self.train_data_path, 'test.txt')
        if os.path.exists(fasttext_train_data_path) and os.path.exists(fasttext_test_data_path):
            return
        all_data = []
        for file in tqdm(files, desc='正在構建訓練數據: '):
            with open(file, 'r') as f:
                data = f.read()
            # Strip emoji/special symbols BEFORE segmentation (measured +2% acc).
            data = filter_content_for_blog_cls(data)
            data = ' '.join(jieba.cut(data))
            if file.find('technology') != -1:
                label = '__label__{}'.format(0)
            elif file.find('advertisement') != -1:
                label = '__label__{}'.format(1)
            else:
                # BUG FIX: the original fell through here and appended the line
                # with the PREVIOUS file's label (NameError on the very first
                # file). Skip unlabeled files instead.
                print("錯誤的數據:{}".format(file))
                continue
            all_data.append(data + '\t' + label + '\n')
        split = int(len(all_data) * 0.8)
        # BUG FIX: write instead of append — the existence guard above only
        # returns when BOTH files exist, so appending after a partial build
        # duplicated lines on rerun.
        with open(fasttext_train_data_path, 'w') as f:
            f.writelines(all_data[:split])
        with open(fasttext_test_data_path, 'w') as f:
            f.writelines(all_data[split:])

    def load_stopwords(self):
        """Return the Chinese stopword list, one word per line."""
        path = './data/pro/datasets/stopwords/cn_stopwords.txt'
        with open(path, 'r') as f:
            return f.read().split('\n')

    def train_fasttext(self):
        """Train a supervised fasttext model and persist it; logs test F1."""
        self.data_process()
        train_path = os.path.join(self.train_data_path, 'train.txt')
        test_path = os.path.join(self.train_data_path, 'test.txt')
        classifier = fasttext.train_supervised(input=train_path,
                                               label="__label__",
                                               dim=100,
                                               epoch=10,
                                               lr=0.1,
                                               wordNgrams=2,
                                               loss='softmax',
                                               thread=8,
                                               verbose=True)
        classifier.save_model(self.fasttext_model_path)
        # result is (n_samples, precision, recall).
        result = classifier.test(test_path)
        # BUG FIX: the original message had no '{}' placeholder, so the
        # result tuple was silently dropped from the log.
        logger.info('Train Result: {}'.format(result))
        logger.info('F1 Score: {}'.format(result[1] * result[2] * 2 /
                                          (result[2] + result[1])))

    def predict(self, text):
        """Classify one raw article string; returns 'technology' or 'advertisement'."""
        test_data = filter_content_for_blog_cls(text)
        test_data = ' '.join(jieba.cut(test_data))
        result = self.model.predict(test_data)
        class_name = result[0][0]
        return self.class_name_mapping[class_name]

    def test_acc(self):
        """Report accuracy on the held-out CSV test set (content,label columns)."""
        data_path = './data/pro/datasets/blogs/blog_adver_cls/test_dev.csv'
        data = pd.read_csv(data_path).dropna(axis=0)
        text_list = []
        for text in data['content']:
            text = filter_content_for_blog_cls(text)
            text_list.append(' '.join(jieba.cut(text)))
        res_labels = []
        for text in text_list:
            label = self.model.predict(text)
            res_labels.append(self.class_name_mapping[label[0][0]])
        print("在測試集准確率為:", metrics.accuracy_score(data['label'], res_labels))

代碼沒什麼難的,主要就是數據處理,這裏也是在分詞之前去除了特殊符號,這樣做效果確實有提昇,可以自己嘗試下。

直接看效果吧:

[INFO][2022-01-03 14:39:23][fasttext_classifier.py:33 at load]: 開始訓練模型...
Read 5M words
Number of words: 261664
Number of labels: 2
Progress: 100.0% words/sec/thread: 1415852 lr: 0.000000 avg.loss: 0.059132 ETA: 0h 0m 0s
[INFO][2022-01-03 14:39:33][fasttext_classifier.py:101 at train_fasttext]: Train Result:
[INFO][2022-01-03 14:39:33][fasttext_classifier.py:102 at train_fasttext]: F1 Score: 0.9638259736027375
[INFO][2022-01-03 14:39:33][fasttext_classifier.py:35 at load]: 加載模型
Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.
在測試集准確率為: 0.9661052631578947

在同一份測試數據集上,Fasttext准確率高了0.2%,但模型大小為912M,使用TFIDF + AdaBoost 訓練出來的模型加起來也就4.9M。

實際推理速度還未測試過,因此目前使用的是占用內存更小的 TFIDF + AdaBoost。

總結

多觀察數據,理解數據特征,對提昇模型效果有莫大的幫助。

事實證明:

1、增加用戶詞典可以提昇准確率
2、去除文本中的特殊字符可以提昇准確率

相關文章:

1、技術/廣告 文章分類器(一)
2、fasttext文本分類

版权声明:本文为[PeasantWorker]所创,转载请带上原文链接,感谢。 https://gsmany.com/2022/01/202201080110133453.html