ccprocessor · darkrush · Jun 19, 2025 · May 22, 2025 · May 22, 2025
diff --git a/docs/llm_web_kit/model/lang_id.md b/docs/llm_web_kit/model/lang_id.md
@@ -4,7 +4,7 @@
 
 is_218e为True时使用lid218e模型，在多个小语种中有更好的表现，除个别容易使模型混淆的情况外，会返回正常的language_details字段，若该参数为False，则language_details字段为空，默认值为True
 
-is_cn_specific为True时，会对文本中的中文文本进行细分，分为zho-Hans(简体中文)或zho-Hant(繁体中文)，结果在language_details字段中，默认值为False
+is_cn_specific为True时，会对文本中的中文文本进行细分，分为zho-Hans(简体中文)或zho-Hant(繁体中文)，结果在language_details字段中，默认值为False。如果需要使用，请先执行`pip install langdetect_zh==1.0.4`，该package使用langdetect的方法，并针对中文进行了特调，能有效识别简体中文和繁体中文
 
 ## 配置文件需要改动的部分
 
@@ -120,3 +120,89 @@ print(update_language_by_str(text, is_cn_specific=True))
 总时间: 1.3538 秒
 
 处理速度: 443.91 条/秒
+
+## 性能说明
+
+测试集使用gsarti/flores_101，该数据集包含102种语言的并行句子，每个语种2009条，测试集路径：https://huggingface.co/datasets/gsarti/flores_101
+
+下表所示lid176为单模型结果，模型路径为s3://web-parse-huawei/shared_resource/language/lid176.bin
+
+lid218e也为单模型结果，模型路径为s3://web-parse-huawei/shared_resource/language/lid218e.bin
+
+级联方案即为该代码调用方案，使用lid176判断zh, en, ja, ko，使用lid218e判断其他语种，使用langdetect_zh区分简体中文与繁体中文
+
+该表统计了三种方案在102种语言上的错误次数，其中lid176在繁体中文上记为全错，是因为该模型无法区分简体中文和繁体中文
+
+| 级联方案  |          | lid176   |          | lid218e   |          |
+| --------- | -------- | -------- | -------- | --------- | -------- |
+| 真实语言  | 错误次数 | 真实语言 | 错误次数 | 真实语言  | 错误次数 |
+| bos       | 1079     | zho_trad | 2009     | bos       | 1079     |
+| kam       | 767      | ful      | 2009     | kam       | 765      |
+| zho_trad  | 18       | lug      | 2009     | zho_trad  | 623      |
+| hrv       | 197      | hau      | 2009     | zho_simpl | 229      |
+| nya       | 165      | ibo      | 2009     | hrv       | 197      |
+| kea       | 145      | kea      | 2009     | nya       | 161      |
+| msa       | 145      | kam      | 2009     | kea       | 145      |
+| ful       | 67       | lin      | 2009     | msa       | 145      |
+| xho       | 51       | luo      | 2009     | ful       | 56       |
+| umb       | 46       | mri      | 2009     | umb       | 46       |
+| zul       | 38       | nso      | 2009     | jpn       | 46       |
+| fas       | 38       | nya      | 2009     | fas       | 38       |
+| ind       | 37       | orm      | 2009     | ind       | 37       |
+| mri       | 27       | sna      | 2009     | xho       | 32       |
+| wol       | 22       | umb      | 2009     | zul       | 16       |
+| ast       | 16       | wol      | 2009     | ast       | 16       |
+| dan       | 13       | xho      | 2009     | dan       | 13       |
+| nob       | 13       | zul      | 2009     | wol       | 13       |
+| nso       | 12       | bos      | 1879     | nob       | 13       |
+| luo       | 11       | ast      | 1373     | nso       | 11       |
+| lug       | 11       | som      | 1184     | luo       | 8        |
+| jav       | 9        | msa      | 1131     | lug       | 7        |
+| sna       | 9        | yor      | 943      | pus       | 7        |
+| ibo       | 8        | oci      | 753      | glg       | 6        |
+| afr       | 7        | hrv      | 609      | jav       | 6        |
+| pus       | 7        | jav      | 590      | mri       | 5        |
+| glg       | 6        | afr      | 574      | hin       | 4        |
+| som       | 4        | glg      | 294      | swe       | 3        |
+| swh       | 4        | uzb      | 188      | yor       | 3        |
+| hin       | 4        | ltz      | 151      | lin       | 3        |
+| yor       | 4        | ceb      | 144      | lao       | 2        |
+| lin       | 4        | nob      | 137      | oci       | 2        |
+| ceb       | 3        | swh      | 118      | som       | 2        |
+| swe       | 3        | mlt      | 109      | ceb       | 2        |
+| lao       | 2        | dan      | 90       | khm       | 2        |
+| oci       | 2        | slv      | 56       | slv       | 1        |
+| uzb       | 2        | ind      | 48       | uzb       | 1        |
+| orm       | 2        | slk      | 41       | npi       | 1        |
+| nld       | 2        | pus      | 37       | tgl       | 1        |
+| hau       | 2        | gle      | 26       | bul       | 1        |
+| slv       | 1        | npi      | 18       | fra       | 1        |
+| zho_simpl | 1        | azj      | 17       | hau       | 1        |
+| npi       | 1        | asm      | 14       | ita       | 1        |
+| eng       | 1        | tgk      | 13       | ltz       | 1        |
+| tgl       | 1        | isl      | 13       | kaz       | 1        |
+| est       | 1        | est      | 12       | por       | 1        |
+| bul       | 1        | snd      | 12       | afr       | 1        |
+| fra       | 1        | cym      | 11       | spa       | 1        |
+| ita       | 1        | cat      | 11       |           |          |
+| khm       | 1        | srp      | 11       |           |          |
+| ltz       | 1        | kir      | 10       |           |          |
+| kaz       | 1        | nld      | 5        |           |          |
+| por       | 1        | por      | 5        |           |          |
+| spa       | 1        | swe      | 4        |           |          |
+|           |          | mkd      | 4        |           |          |
+|           |          | lav      | 4        |           |          |
+|           |          | urd      | 3        |           |          |
+|           |          | tgl      | 3        |           |          |
+|           |          | kaz      | 2        |           |          |
+|           |          | ron      | 2        |           |          |
+|           |          | ita      | 2        |           |          |
+|           |          | bel      | 2        |           |          |
+|           |          | bul      | 2        |           |          |
+|           |          | lit      | 2        |           |          |
+|           |          | lao      | 1        |           |          |
+|           |          | ckb      | 1        |           |          |
+
+根据统计表格，lid176准确率为0.7715，lid218e准确率为0.9817，级联方案准确率为0.9853，准确率公式为：1-sum(错误次数)/(102\*2009)
+
+级联方案相比于lid176提升了多语种的准确率，同时也解决了lid218e针对部分语种（中文简体、中文繁体、日语）的错误
diff --git a/docs/llm_web_kit/model/politics_detector.md b/docs/llm_web_kit/model/politics_detector.md
@@ -1,8 +1,8 @@
 ## 作用
 
-识别中文或英文文本中的涉政内容，目前包含了新旧两类接口，旧的接口接收单条数据，并返回该数据的涉政分数，分数接近1代表不涉政，分数接近0则代表涉政。目前旧的接口仅支持CPU模型。
+识别中文或英文文本中的涉政内容，目前包含了25m3_cpu和25m3两类模型接口。25m3_cpu模型接口接收单条数据，并返回该数据的涉政分数，分数接近1代表不涉政，分数接近0则代表涉政。目前25m3_cpu模型接口仅支持CPU模型。
 
-新的接口检测结果以ModelResponse类返回，该类包含is_remained和details两个字段，其中is_remained代表数据是否需要保留，details则是一个包含涉政分数等详细信息的字典。新的接口支持CPU和GPU两种模型。
+25m3模型接口检测结果以ModelResponse类返回，该类包含is_remained和details两个字段，其中is_remained代表数据是否需要保留，details则是一个包含涉政分数等详细信息的字典。25m3模型接口支持GPU模型。
 
 ## 配置文件需要改动的部分
 
@@ -13,20 +13,20 @@
         "common":{
             "cache_path": "~/.llm_web_kit_cache"
         },
-        "political-24m7":{
-            "download_path": "s3://web-parse-huawei/shared_resource/political/24m7.zip",
-            "md5": "97eabb56268a3af3f68e8a96a50d5f80",
-        },
         "political-25m3":{
             "download_path": "s3://web-parse-huawei/shared_resource/political/25m3.zip",
             "md5": "d0d14a561f987763d654165b536b5858",
         },
+        "political-25m3_cpu":{
+            "download_path": "s3://web-parse-huawei/shared_resource/political/25m3_cpu.zip",
+            "md5": "926359a393de6a36c1b4be403711767f",
+        },
     },
 ```
 
 ## 调用方法
 
-1. 旧的接口调用方法如下：
+1. 25m3_cpu模型接口调用方法如下：
 
 ```python
 from llm_web_kit.model.politics_detector import *
@@ -81,7 +81,7 @@ print(political_filter_cpu(text, "en"))
 # 输出结果为：{'political_prob': 1.0000100135803223}
 ```
 
-2. 新的接口调用方法如下：
+2. 25m3模型接口调用方法如下：
 
 ```python
 from llm_web_kit.model.model_impl import ModelFactory, ModelType, DeviceType
@@ -113,7 +113,7 @@ for i in range(0, len(requests), batch_size):
 
 ## 运行时间
 
-1. 旧的接口（political_filter_cpu）
+1. 25m3_cpu模型接口（political_filter_cpu）
 
    使用型号为`AMD EPYC 7742`的cpu单核进行测试，测试集总共有 77861 条数据（均是中英文的数据），下面只统计了political_filter_cpu接口本身的耗时，排除了数据读取的时间。
 
@@ -127,7 +127,7 @@ for i in range(0, len(requests), batch_size):
 
    每秒可处理: 416.3049条数据
 
-2. 新的接口（predictor.predict_batch）
+2. 25m3模型接口（predictor.predict_batch）
 
    使用单卡NVIDIA A100测试涉政的GPU模型，测试集共有39111条数据，下面统计了不同batch_size下，predictor.predict_batch接口的速度，该接口内部包括tokenize和模型推理操作。
 
@@ -159,3 +159,29 @@ for i in range(0, len(requests), batch_size):
    | 128        | 31.580092769179686 |
    | 256        | 24.26296225431703  |
    | 512        | cuda out of memory |
+
+## 性能说明
+
+25m3_cpu模型（threshold=0.5）：
+
+测试集路径：s3://xyz-process-ylk2/xyz-users/huyucheng1/political_data_202502/test/
+
+| 指标          | 新模型值             | 旧模型值             |
+| ------------- | -------------------- | -------------------- |
+| **F1**        | 0.9089603520041284   | 0.8831507760632497   |
+| **Accuracy**  | 0.8624864742896118   | 0.8013861609546715   |
+| **Precision** | 0.9041776426882809   | 0.7913184992146802   |
+| **Recall**    | 0.9137939273134369   | 0.999095513748191    |
+| **TN**        | 68641                | 19820                |
+| **FP**        | 28373                | 77194                |
+| **FN**        | 25257                | 265                  |
+| **TP**        | 267727               | 292719               |
+| **Prec_Pos**  | 0.9041776426882809   | 0.7913184992146802   |
+| **Recl_Pos**  | 0.9137939273134369   | 0.999095513748191    |
+| **F1_Pos**    | 0.9089603520041284   | 0.8831507760632497   |
+| **Prec_Neg**  | 0.7310166350720995   | 0.986806074184715    |
+| **Recl_Neg**  | 0.7075370565073082   | 0.204300410250067    |
+| **F1_Neg**    | 0.719085232986926    | 0.3385169813576546   |
+| **qps**       | 1493.477337807 条/秒 | 1674.157845704 条/秒 |
+
+注：上述指标均是在集群中得出，单核运行时间请参考“运行时间”一节的第1小节
diff --git a/docs/llm_web_kit/model/rule_based_safety_module.md b/docs/llm_web_kit/model/rule_based_safety_module.md
@@ -10,8 +10,8 @@
             "cache_path": "~/.llm_web_kit_cache"
         },
         "unsafe_words":{
-            "download_path": "s3://web-parse-huawei/shared_resource/political/unsafe_words.jsonl",
-            "md5": "e81dd1050a79f68b9d9b3f66baadde66",
+            "download_path": "s3://web-parse-huawei/shared_resource/unsafe_words/unsafe_words_porn_politics.jsonl",
+            "md5": "ef51faf114353d987ec97b211a8d2b06",
         },
         "xyz_internal_unsafe_words":{
             "download_path": "s3://web-parse-huawei/shared_resource/political/xyz_internal_unsafe_words.jsonl",
@@ -51,6 +51,32 @@ m.process("your content",
  'safety_infos': {'domain_level': '', 'hit_unsafe_words': False}}
 ```
 
+### 敏感词检测模块用法示例
+
+```python
+from llm_web_kit.model.unsafe_words_detector import *
+
+checker = UnsafeWordChecker(language="zh-en")
+
+content = "64式销售QQ"
+unsafe_words = checker.check_unsafe_words(
+    content_str=content,
+)
+print(unsafe_words)
+# 输出结果为：[{'word': '64式', 'type': '违禁品', 'level': 'L3', 'language': 'zh', 'count': 1.0}, {'word': '64式销售', 'type': '违禁品', 'level': 'L3', 'language': 'zh', 'count': 1.0}, {'word': '销售', 'type': '广告营销', 'level': 'L3', 'language': 'zh', 'count': 1.0}, {'word': '64式销售qq', 'type': '违禁品', 'level': 'L1', 'language': 'zh', 'count': 1.0}]
+
+checker = UnsafeWordsFilter()
+content = "64式销售QQ"
+# from_safe_source: 是否来自安全来源。如果是，直接返回安全。
+# from_domestic_source: 是否来自国内来源。如果是，仅检查 L1 级别的不安全词；否则检查 L1 和 L2 级别。
+result = checker.filter(
+    content,
+    'zh',
+    from_safe_source = False,
+    from_domestic_source = True,
+)
+```
+
 ## 速度
 
 ### 整体速度：

diff --git a/llm_web_kit/model/model_impl.py b/llm_web_kit/model/model_impl.py
@@ -112,7 +112,7 @@ def convert_result_to_response(self, result: dict) -> ModelResponse:
         # raise NotImplementedError
         # TODO convert result to response ensure the threshold
         return PoliticalResponse(
-            is_remained=result['political_prob'] > 0.99, details=result
+            is_remained=result['political_prob'] > 0.89, details=result
         )
 
 

diff --git a/llm_web_kit/model/politics_detector.py b/llm_web_kit/model/politics_detector.py
@@ -27,20 +27,20 @@ def __init__(self, model_path: str = None):
         if not model_path:
             model_path = self.auto_download()
         model_bin_path = os.path.join(model_path, 'model.bin')
-        tokenizer_path = os.path.join(model_path, 'internlm2-chat-20b')
+        tokenizer_path = os.path.join(model_path, 'qwen2.5_7b_tokenizer')
 
         self.model = fasttext.load_model(model_bin_path)
         self.tokenizer = transformer.AutoTokenizer.from_pretrained(
             tokenizer_path, use_fast=False, trust_remote_code=True
         )
 
     def auto_download(self):
-        """Default download the 24m7.zip model."""
-        resource_name = 'political-24m7'
+        """Default download the 25m3_cpu.zip model."""
+        resource_name = 'political-25m3_cpu'
         resource_config = load_config()['resources']
-        political_24m7_config: dict = resource_config[resource_name]
-        political_24m7_s3 = political_24m7_config['download_path']
-        political_24m7_md5 = political_24m7_config.get('md5', '')
+        political_25m3_cpu_config: dict = resource_config[resource_name]
+        political_25m3_cpu_s3 = political_25m3_cpu_config['download_path']
+        political_25m3_cpu_md5 = political_25m3_cpu_config.get('md5', '')
         # get the zip path calculated by the s3 path
         zip_path = os.path.join(CACHE_DIR, f'{resource_name}.zip')
         # the unzip path is calculated by the zip path
@@ -52,9 +52,9 @@ def auto_download(self):
             logger.info(f'try to unzip from zip_path: {zip_path}')
             if not os.path.exists(zip_path):
                 logger.info(f'zip_path: {zip_path} does not exist')
-                logger.info(f'downloading {political_24m7_s3}')
+                logger.info(f'downloading {political_25m3_cpu_s3}')
                 zip_path = download_auto_file(
-                    political_24m7_s3, zip_path, political_24m7_md5
+                    political_25m3_cpu_s3, zip_path, political_25m3_cpu_md5
                 )
             logger.info(f'unzipping {zip_path}')
             unzip_path = unzip_local_file(zip_path, unzip_path)
@@ -195,7 +195,7 @@ def get_singleton_political_detect() -> PoliticalDetector:
 def decide_political_by_prob(
     predictions: Tuple[str], probabilities: Tuple[float]
 ) -> float:
-    idx = predictions.index('__label__normal')
+    idx = predictions.index('__label__positive')
     normal_score = probabilities[idx]
     return float(normal_score)
 
@@ -226,8 +226,6 @@ def political_filter_cpu(data_dict: Dict[str, Any], language: str):
 
 if __name__ == '__main__':
     test_cases = []
-    test_cases.append('你好，我很高兴见到你！')
-    test_cases.append('hello, nice to meet you!')
     test_cases.append('你好，唔該幫我一個忙？')
     test_cases.append('Bawo ni? Mo nife Yoruba. ')
     test_cases.append(

diff --git a/llm_web_kit/model/porn_detector.py b/llm_web_kit/model/porn_detector.py
@@ -172,7 +172,7 @@ def __init__(self, model_path: str = None) -> None:
             model_config = json.load(reader)
         self.cls_index = int(model_config.get('cls_index', 1))
         self.use_sigmoid = bool(model_config.get('use_sigmoid', False))
-        self.max_tokens = int(model_config.get('max_tokens', 300))
+        self.max_tokens = int(model_config.get('max_tokens', 512))
         self.remain_tail = min(
             self.max_tokens - 1, int(model_config.get('remain_tail', -1))
         )

diff --git a/llm_web_kit/model/unsafe_words_detector.py b/llm_web_kit/model/unsafe_words_detector.py
@@ -68,7 +68,7 @@
     unsafe_words_file_path = auto_download(language)
     t2 = time.time()
     print(
-        f'-----------------auto_download cost time: {t2-t1} , language: {language}------------------'
+        f'-----------------auto_download cost time: {t2 - t1} , language: {language}------------------'
     )
     with open(unsafe_words_file_path, 'r') as f:
         lines = f.readlines()
@@ -85,6 +85,8 @@
     words = {}
     for line in lines:
         w = json_loads(line)
+        if w.get('tag') == 'delete':
+            continue
         word = str(w.get('word') or '').lower()
         if not word:
             continue
@@ -163,7 +165,7 @@
         self.ac = get_ac(language)
         t2 = time.time()
         print(
-            f'---------------UnsafeWordChecker init time: {t2-t1} , language: {language}-----------------'
+            f'---------------UnsafeWordChecker init time: {t2 - t1} , language: {language}-----------------'
         )
 
     def check_unsafe_words(self, content_str: str) -> list:

diff --git a/tests/llm_web_kit/model/test_model_impl.py b/tests/llm_web_kit/model/test_model_impl.py
@@ -92,14 +92,14 @@ def test_convert_result_to_response(self, mock_load_model):
         mock_load_model.return_value = MagicMock()
         model = PoliticalCPUModel()
 
-        # Test case where political_prob > 0.99 (should be flagged)
-        result = {'political_prob': 0.995}
+        # Test case where political_prob > 0.89 (should be retained)
+        result = {'political_prob': 0.9}
         response = model.convert_result_to_response(result)
         assert response.is_remained
         assert response.details == result
 
-        # Test case where political_prob <= 0.99 (should not be flagged)
-        result = {'political_prob': 0.985}
+        # Test case where political_prob <= 0.89 (should not be retained)
+        result = {'political_prob': 0.88}
         response = model.convert_result_to_response(result)
         assert not response.is_remained
         assert response.details == result