import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
# Sample data (assumes intent-labeled examples already exist)
data = {
    'text': ["订一张去北京的机票",          # "Book a flight to Beijing"
             "给我看看最新的智能手机型号",    # "Show me the latest smartphone models"
             "明天的天气怎么样?",           # "What's the weather like tomorrow?"
             "你觉得天气怎么样?",           # "What do you think of the weather?"
             "python怎么样?",              # "How is Python?"
             "告诉我关于Python编程的信息"],  # "Tell me about Python programming"
    'intent': ["flight_booking",
               "product_information",
               "weather_information",
               "weather_information",
               "software_information",
               "software_information"]
}
df = pd.DataFrame(data)
# Tokenizer (uses jieba for Chinese word segmentation)
def chinese_tokenizer(text):
    return list(jieba.cut(text))
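# Illustrative segmentation (the exact output depends on jieba's dictionary and version):
# chinese_tokenizer("订一张去北京的机票") -> ['订', '一张', '去', '北京', '的', '机票']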
# Split into training and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(df['text'], df['intent'], test_size=0.2, random_state=42)
# Define the TF-IDF vectorizer and the classifier (logistic regression here).
# Passing token_pattern=None alongside a custom tokenizer silences sklearn's
# warning that the default token_pattern would otherwise be ignored.
tfidf_vectorizer = TfidfVectorizer(tokenizer=chinese_tokenizer, token_pattern=None)
classifier = LogisticRegression(max_iter=1000)
# Build the pipeline
model = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('clf', classifier),
])
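# Pipeline.fit() first calls tfidf.fit_transform() on the raw texts, then trains
# the classifier on the resulting sparse TF-IDF matrix; predict() applies the
# same transform before classifying.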
# Train the model
model.fit(train_texts, train_labels)
# Evaluate on the held-out test set
predictions = model.predict(test_texts)
print("Accuracy:", accuracy_score(test_labels, predictions))
print("\nClassification Report:")
print(classification_report(test_labels, predictions))
# New, unseen queries
test_data = ["预定一间去上海的酒店",    # "Book a hotel in Shanghai"
             "我如何学习机器学习?",     # "How do I learn machine learning?"
             "今天北京的温度是多少?"]   # "What's the temperature in Beijing today?"
# Predict intents for the new queries
predictions = model.predict(test_data)
# Print the predictions
for text, intent in zip(test_data, predictions):
    print(f"Text: {text} | Predicted Intent: {intent}")