我来帮你实现一个本地运行的零样本文本分类工具,无需网络连接,这里基于BERT-like模型实现OpenClaw的核心思想。

方案一:基于 transformers 的本地零样本分类
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import numpy as np
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')
class LocalOpenClaw:
    """Locally-run zero-shot text classifier.

    Scores each candidate label by pairing the input text with an
    NLI-style hypothesis sentence and reading the model's output
    probabilities. No network access is needed once the model files are
    cached locally.

    NOTE(review): the default ``bert-base-uncased`` checkpoint is not
    fine-tuned for NLI, so its scores are essentially arbitrary; for
    meaningful results pass an NLI checkpoint (e.g. a ``*-mnli`` model).
    """

    # Fixed hypothesis template. The original implementation picked a
    # template via random.choice per call, which made predict()
    # non-reproducible; a deterministic template fixes that.
    DEFAULT_TEMPLATE = "This text is about {}."

    def __init__(self, model_name: str = "bert-base-uncased"):
        """Load tokenizer and sequence-classification model.

        Parameters:
            model_name: Hugging Face model id or local path
                (defaults to ``bert-base-uncased``).
        """
        print(f"加载模型: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model.eval()  # inference mode: disables dropout etc.

    def create_hypothesis(self, text: str, label: str) -> str:
        """Return the hypothesis sentence used to probe *label*.

        ``text`` is unused but kept for backward compatibility with the
        original signature. Fix: uses a fixed template instead of a
        random choice, so repeated predictions agree.
        """
        return self.DEFAULT_TEMPLATE.format(label)

    def predict(self,
                text: str,
                candidate_labels: List[str],
                multi_label: bool = False,
                hypothesis_template: str = None) -> Dict:
        """Zero-shot classification of *text* over *candidate_labels*.

        Parameters:
            text: the text to classify.
            candidate_labels: labels to score; must be non-empty.
            multi_label: if True, return every label scoring above 0.5.
            hypothesis_template: optional ``str.format`` template taking
                the label as its single positional argument.

        Returns:
            dict with "sequence", "labels", "scores" and either
            "prediction"/"confidence" (single-label) or "predictions"
            (multi-label).

        Raises:
            ValueError: if ``candidate_labels`` is empty (previously
                surfaced as an opaque IndexError).
        """
        if not candidate_labels:
            raise ValueError("candidate_labels must not be empty")
        results = []
        with torch.no_grad():
            for label in candidate_labels:
                # Build the hypothesis sentence for this label.
                if hypothesis_template:
                    hypothesis = hypothesis_template.format(label)
                else:
                    hypothesis = self.create_hypothesis(text, label)
                # Encode premise/hypothesis as a sentence pair.
                inputs = self.tokenizer(
                    text,
                    hypothesis,
                    return_tensors="pt",
                    truncation=True,
                    padding=True,
                    max_length=512
                )
                outputs = self.model(**inputs)
                probs = torch.softmax(outputs.logits, dim=-1)
                # For 3-way NLI heads the conventional class order is
                # contradiction / neutral / entailment, so the last class
                # probability is used as the label score.
                # NOTE(review): this is a simplification — for heads with
                # a different label order (or a non-NLI head) this score
                # is meaningless; verify against the model config.
                score = probs[0, -1].item()
                results.append({
                    "label": label,
                    "score": score
                })
        # Highest score first.
        results.sort(key=lambda x: x["score"], reverse=True)
        if not multi_label:
            # Single-label: top-ranked label wins.
            best_result = results[0]
            return {
                "sequence": text,
                "labels": [r["label"] for r in results],
                "scores": [r["score"] for r in results],
                "prediction": best_result["label"],
                "confidence": best_result["score"]
            }
        # Multi-label: keep every label whose score clears the threshold.
        threshold = 0.5  # tunable cut-off
        predictions = [r["label"] for r in results if r["score"] > threshold]
        return {
            "sequence": text,
            "labels": [r["label"] for r in results],
            "scores": [r["score"] for r in results],
            "predictions": predictions
        }

    def batch_predict(self,
                      texts: List[str],
                      candidate_labels: List[str]) -> List[Dict]:
        """Classify each text independently; see :meth:`predict`."""
        return [self.predict(text, candidate_labels) for text in texts]
if __name__ == "__main__":
    # Build the classifier (downloads the model on first run, then
    # reuses the local cache).
    classifier = LocalOpenClaw()

    sample = "The stock market reached new highs today after positive economic indicators were released."
    topic_labels = ["finance", "sports", "politics", "technology", "entertainment"]

    outcome = classifier.predict(sample, topic_labels)

    print("文本:", sample)
    print("\n预测结果:")
    print(f" 预测类别: {outcome['prediction']}")
    print(f" 置信度: {outcome['confidence']:.3f}")
    print("\n所有候选标签得分:")
    for name, value in zip(outcome["labels"], outcome["scores"]):
        print(f" {name}: {value:.3f}")
方案二:使用更轻量级的模型(推荐)
from sentence_transformers import SentenceTransformer, util
import numpy as np
class LightweightOpenClaw:
    """Lightweight zero-shot classifier built on sentence-transformers.

    Embeds the input text and a short natural-language description of
    every candidate label, then ranks labels by cosine similarity.
    Smaller and faster than a full NLI model.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Load a sentence-transformers encoder.

        Recommended checkpoints:
          - 'all-MiniLM-L6-v2'        (default, ~22MB)
          - 'paraphrase-MiniLM-L3-v2' (smaller)
          - 'all-mpnet-base-v2'       (larger but more accurate)
        """
        print(f"加载轻量模型: {model_name}")
        self.model = SentenceTransformer(model_name)

    def predict(self,
                text: str,
                candidate_labels: List[str],
                multi_label: bool = False,
                threshold: float = 0.5) -> Dict:
        """Rank *candidate_labels* for *text* by embedding similarity.

        Steps: encode text and label descriptions, compute cosine
        similarity, sort descending. In multi-label mode every label
        whose similarity exceeds ``threshold`` is returned.
        """
        # A short sentence per label gives the encoder more context
        # than the bare label word would.
        descriptions = [f"This text is about {label}." for label in candidate_labels]

        text_vec = self.model.encode(text, convert_to_tensor=True)
        label_vecs = self.model.encode(descriptions, convert_to_tensor=True)

        similarities = util.cos_sim(text_vec, label_vecs)[0].cpu().numpy()

        # Pair labels with their scores, best match first.
        ranked = sorted(
            ({"label": lab, "score": float(sim)}
             for lab, sim in zip(candidate_labels, similarities)),
            key=lambda item: item["score"],
            reverse=True,
        )

        all_labels = [item["label"] for item in ranked]
        all_scores = [item["score"] for item in ranked]

        if not multi_label:
            # Single-label: the top-ranked label wins.
            top = ranked[0]
            return {
                "sequence": text,
                "labels": all_labels,
                "scores": all_scores,
                "prediction": top["label"],
                "confidence": top["score"]
            }
        # Multi-label: keep everything above the similarity threshold.
        return {
            "sequence": text,
            "labels": all_labels,
            "scores": all_scores,
            "predictions": [item["label"] for item in ranked
                            if item["score"] > threshold]
        }
if __name__ == "__main__":
    # Instantiate the lightweight classifier.
    classifier = LightweightOpenClaw()

    samples = [
        "Apple unveiled its new iPhone with advanced camera features.",
        "The football team won the championship after an intense game.",
        "The government announced new economic policies to combat inflation."
    ]
    topic_set = ["technology", "sports", "politics", "finance", "entertainment"]

    print("零样本文本分类示例:\n")
    for idx, sample in enumerate(samples, 1):
        outcome = classifier.predict(sample, topic_set)
        print(f"文本{idx}: {sample}")
        print(f"预测: {outcome['prediction']} (置信度: {outcome['confidence']:.3f})")
        print("-" * 50)
方案三:简单实用的版本(无深度学习)
import re
from collections import Counter
import math
class SimpleOpenClaw:
    """Keyword-matching zero-shot classifier.

    Pure standard library — no deep-learning dependencies. Each label's
    score is a term-frequency sum over a predefined keyword list,
    weighted by a simplified IDF-like constant, then normalized across
    labels.
    """

    def __init__(self):
        # Built-in keyword lexicon; extend via add_keywords().
        self.keyword_dict = {
            "technology": ["computer", "software", "hardware", "phone", "internet",
                           "digital", "tech", "app", "program", "code", "ai", "robot"],
            "sports": ["game", "team", "player", "score", "win", "lose",
                       "championship", "football", "basketball", "sport", "athlete"],
            "finance": ["stock", "market", "money", "bank", "investment",
                        "price", "economy", "financial", "trade", "currency"],
            "politics": ["government", "president", "policy", "election", "law",
                         "political", "minister", "vote", "democracy", "party"],
            "health": ["medical", "doctor", "hospital", "disease", "health",
                       "medicine", "patient", "treatment", "virus", "vaccine"]
        }

    def preprocess(self, text: str) -> List[str]:
        """Lowercase, strip punctuation, and tokenize on whitespace."""
        cleaned = re.sub(r'[^\w\s]', ' ', text.lower())
        return cleaned.split()

    def predict(self, text: str, candidate_labels: List[str]) -> Dict:
        """Score *candidate_labels* against *text* by keyword overlap.

        Labels missing from the lexicon score 0; if every label scores
        0, ties resolve in the order the labels were given. Returns a
        dict with "sequence", "prediction", "confidence", "scores"
        ("unknown"/0 when no labels are supplied).
        """
        tokens = self.preprocess(text)
        counts = Counter(tokens)
        n_tokens = len(tokens)

        raw = {}
        for label in candidate_labels:
            keywords = self.keyword_dict.get(label, [])
            # Constant pseudo-IDF per label (simplified weighting).
            idf = math.log(1 + 1 / len(keywords)) if keywords else 0.0
            total_score = 0.0
            for keyword in keywords:
                if keyword in counts:
                    tf = counts[keyword] / n_tokens if n_tokens > 0 else 0
                    total_score += tf * idf
            raw[label] = total_score

        # Normalize so the scores sum to 1 (when any keyword matched).
        grand_total = sum(raw.values())
        if grand_total > 0:
            raw = {lab: val / grand_total for lab, val in raw.items()}

        ranked = sorted(raw.items(), key=lambda kv: kv[1], reverse=True)
        if ranked:
            top_label, top_score = ranked[0]
        else:
            top_label, top_score = "unknown", 0
        return {
            "sequence": text,
            "prediction": top_label,
            "confidence": top_score,
            "scores": dict(ranked)
        }

    def add_keywords(self, label: str, keywords: List[str]):
        """Register extra keywords for *label*, creating it if needed."""
        self.keyword_dict.setdefault(label, []).extend(keywords)
if __name__ == "__main__":
    # Keyword-based classifier needs no model download at all.
    classifier = SimpleOpenClaw()

    sample = "The new smartphone has amazing features and fast processor"
    outcome = classifier.predict(sample, ["technology", "sports", "finance"])

    print(f"文本: {sample}")
    print(f"预测: {outcome['prediction']}")
    print(f"置信度: {outcome['confidence']:.3f}")
    print("\n详细得分:")
    for name, value in outcome['scores'].items():
        print(f" {name}: {value:.3f}")
安装和运行说明
对于方案一和方案二:
# 安装依赖
pip install torch transformers sentence-transformers

# 运行
python openclaw_local.py
对于方案三:
# 无需安装额外依赖
python simple_openclaw.py
特点:
- 完全离线运行:所有模型都从本地加载
- 零样本学习:不需要训练数据
- 灵活可扩展:支持自定义标签和模板
- 多种选择:提供三个不同复杂度的方案
建议:
- 如果需要高准确度,使用方案一
- 如果需要平衡速度和准确度,使用方案二
- 如果完全不想安装深度学习库,使用方案三
你希望我详细解释哪个方案的原理,或者需要针对特定任务进行修改吗?
标签:深度学习 图像识别
版权声明:除非特别标注,否则均为本站原创文章,转载时请以链接形式注明文章出处。