阿里云文本审核(java敏感词校验)

2021-07-08
0

文本敏感词校验

背景: 用户在社区发布或分享的文章、评论需要合法合规,不能包含涉政等违规的敏感词。

使用阿里云的内容审核

准备工作

  • 阿里云账号,并且需要开通敏感词服务。注意:仅有 accessKeyId 和 accessKeySecret 是不行的,必须先开通该服务
  • 文档地址: https://help.aliyun.com/document_detail/70439.html?spm=a2c4g.11186623.6.701.1cea2188bMuZUi
  • 调试地址:https://next.api.aliyun.com/api/Green/2018-05-09/TextScan

java代码(做一次笔记吧qaq)

  • 依赖
<!-- Aliyun Java SDK core artifact; the only dependency this note lists for the handler code below. -->
<dependency>
  <groupId>com.aliyun</groupId>
  <artifactId>aliyun-java-sdk-core</artifactId>
  <version>4.5.16</version>
</dependency>
  • 代码
@Slf4j
@Component
public class AliYunWordFilterHandler {
    
    

    /** Region id handed to {@code DefaultProfile.getProfile} when the client is built. */
    private static final String region = "cn-beijing";

    /**
     * Access key id, injected from the {@code aliyun.oss.accessKeyId} property.
     * NOTE(review): the property lives under the {@code aliyun.oss} prefix but is used here for
     * the text-scan client - confirm the same key pair is meant to be shared.
     */
    @Value("${aliyun.oss.accessKeyId}")
    private String accessKeyId;

    /** Access key secret, injected from the {@code aliyun.oss.accessKeySecret} property. */
    @Value("${aliyun.oss.accessKeySecret}")
    private String accessKeySecret;

	//设置获取client
    /**
     * Builds an ACS client from the configured region and access-key pair.
     *
     * @return a newly constructed {@code DefaultAcsClient}; nothing is cached between calls
     */
    private IAcsClient getClient() {
        return new DefaultAcsClient(DefaultProfile.getProfile(region, accessKeyId, accessKeySecret));
    }
    
	//设置请求头
    /**
     * Creates the request template used for text scanning: an HTTPS POST against the
     * {@code /green/text/scan} action, API version {@code 2018-05-09}, with a JSON content type.
     *
     * @return a new request that still needs its HTTP body set by the caller
     */
    private CommonRequest getDefaultCommonRequest() {
        final CommonRequest scanRequest = new CommonRequest();

        // Where and how the call is sent.
        scanRequest.setProtocol(ProtocolType.HTTPS);
        scanRequest.setMethod(MethodType.POST);
        scanRequest.setDomain("green.cn-beijing.aliyuncs.com");
        scanRequest.setVersion("2018-05-09");
        // Text scan action; the feedback action would be "/green/text/feedback" instead.
        scanRequest.setUriPattern("/green/text/scan");

        // Connect and read timeouts.
        scanRequest.setSysConnectTimeout(6000);
        scanRequest.setSysReadTimeout(6000);

        scanRequest.putHeadParameter("Content-Type", "application/json");
        return scanRequest;
    }
    
	//请求参数封装 map
    /**
     * Wraps the given texts into the request body as plain maps.
     *
     * <p>The body has two entries: {@code scenes}, fixed to the single scene {@code antispam},
     * and {@code tasks}, holding one entry per text with a random {@code dataId} and the text
     * itself as {@code content}.
     *
     * @param tasks texts to be scanned, one task per element
     * @return the request body, ready to be serialized to JSON
     */
    private Map<String, Object> getExecuteMap(List<String> tasks) {
        final List<Map<String, Object>> taskEntries = new ArrayList<>();
        for (String text : tasks) {
            final Map<String, Object> entry = new HashMap<>();
            entry.put("dataId", UUID.randomUUID().toString());
            // Text to check; the original note gives a limit of 10000 characters.
            entry.put("content", text);
            taskEntries.add(entry);
        }

        final Map<String, Object> body = new HashMap<>(2);
        // Detection scene; text spam detection uses "antispam".
        body.put("scenes", Collections.singletonList("antispam"));
        body.put("tasks", taskEntries);
        return body;
    }
    
	//请求参数封装 JSONObject (linkhashMap)
    /**
     * Wraps the given texts into the request body using {@code JSONObject}/{@code JSONArray}.
     *
     * <p>Produces the same two entries as the map-based builder: {@code scenes} (the single scene
     * {@code antispam}) and {@code tasks} (one object per text with a random {@code dataId} and
     * the text as {@code content}).
     *
     * @param tasks texts to be scanned, one task per element
     * @return the request body as a JSON object
     */
    private JSONObject getExecuteJSONObject(List<String> tasks) {
        final JSONArray taskArray = new JSONArray();
        for (String text : tasks) {
            final JSONObject entry = new JSONObject();
            entry.put("dataId", UUID.randomUUID().toString());
            // Text to check; the original note gives a limit of 10000 characters.
            entry.put("content", text);
            taskArray.add(entry);
        }

        final JSONObject body = new JSONObject();
        // Detection scene; text spam detection uses "antispam".
        body.put("scenes", Collections.singletonList("antispam"));
        body.put("tasks", taskArray);
        return body;
    }

    /**
     * Sends a batch of texts to the text-scan endpoint and converts the response into verdicts.
     *
     * <p>Per the original note, the API accepts at most 100 texts per call and each text may be
     * at most 10000 characters long; this method does not enforce either limit.
     *
     * <p>One {@link AuditInfo} is appended for every scene result of every task whose own
     * {@code code} is 200. Failed tasks are logged and skipped, so the returned list can be
     * shorter than {@code content}. A transport failure, a non-success HTTP status or a non-200
     * top-level {@code code} is logged and the verdicts collected so far are returned.
     *
     * @param content texts to audit; {@code null} or empty yields an empty list without a remote call
     * @return the collected verdicts in response order, never {@code null}
     */
    @TimerLog
    public List<AuditInfo> textReviews(List<String> content) {
        List<AuditInfo> result = new ArrayList<>();
        if (content == null || content.isEmpty()) {
            // Nothing to audit; avoid a pointless request.
            return result;
        }

        IAcsClient client = getClient();
        CommonRequest request = getDefaultCommonRequest();
        // Serialize once and reuse the string for both the log line and the HTTP body.
        String requestJson = JSONUtils.toString(getExecuteMap(content));
        log.info("阿里敏感词检测:[start]:\n {}", requestJson);
        request.setHttpContent(requestJson.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8.name(), JSON);

        try {
            CommonResponse response = client.getCommonResponse(request);
            log.info("阿里敏感词检测:[end]:\n {}", response.getData());

            HttpResponse httpResponse = response.getHttpResponse();
            if (!httpResponse.isSuccess()) {
                log.error("阿里敏感词检测:HTTP响应失败");
                return result;
            }
            log.info("敏感词效验 成功");

            String responseContent = new String(httpResponse.getHttpContent(), StandardCharsets.UTF_8);
            JSONObject scrResponse = JSONUtils.parseObject(responseContent, JSONObject.class);
            // Compare as Integer first so a missing "code" is reported instead of throwing on unboxing.
            Integer responseCode = scrResponse.getInteger("code");
            if (responseCode == null || responseCode != 200) {
                log.error("检测状态失败 code:{}", responseCode);
                return result;
            }

            JSONArray taskResults = scrResponse.getJSONArray("data");
            if (taskResults == null) {
                log.error("阿里敏感词检测:响应缺少 data 字段");
                return result;
            }

            for (int i = 0; i < taskResults.size(); i++) {
                JSONObject taskResultObj = taskResults.getJSONObject(i);

                Integer taskCode = taskResultObj.getInteger("code");
                if (taskCode == null || taskCode != 200) {
                    log.error("阿里敏感词检测:task process fail, code:{}", taskCode);
                    continue;
                }

                JSONArray sceneResults = taskResultObj.getJSONArray("results");
                if (sceneResults == null) {
                    continue;
                }

                AuditInfo auditInfo = new AuditInfo();
                // "filteredContent" is only returned when the text hit a custom keyword library;
                // the hit keywords are then replaced by asterisks. Otherwise this stays null.
                auditInfo.setContent(taskResultObj.getString("filteredContent"));

                // Only the "antispam" scene is requested, so one scene result per task is expected.
                for (int j = 0; j < sceneResults.size(); j++) {
                    JSONObject sceneResult = sceneResults.getJSONObject(j);
                    String label = sceneResult.getString("label");
                    double rate = sceneResult.getDouble("rate");
                    result.add(convertMsg(label, auditInfo, rate));
                }
            }
        } catch (ClientException e) {
            // Pass the exception to the logger so the stack trace lands in the application log.
            log.error("请求调用失败,检查是否是超时", e);
        }
        return result;
    }
    
/**
 * Translates a scan label and its score into the verdict stored on {@code audit}.
 *
 * <p>The verdict starts as approved ("审核正常") and is then adjusted per label:
 * <ul>
 *   <li>{@code politics}, {@code terrorism}, {@code contraband}: always rejected;</li>
 *   <li>{@code abuse} (rate &gt; 70) and {@code porn} (rate &gt; 90): rejected above the threshold;</li>
 *   <li>{@code spam}, {@code ad} (rate &gt; 50), {@code qrcode} (rate &gt; 60), {@code flood},
 *       {@code meaningless} (rate &gt; 95): still approved, but the message names the category;</li>
 *   <li>{@code normal}: left as approved;</li>
 *   <li>any other label: approved with the message "自定义".</li>
 * </ul>
 *
 * @param label category label taken from the scan result
 * @param audit verdict object to fill in; it is modified in place
 * @param rate  score taken from the scan result, compared against the thresholds above
 * @return the same {@code audit} instance
 */
private AuditInfo convertMsg(String label, AuditInfo audit, double rate) {
    // Default verdict; every branch below either keeps it or overrides it.
    applyVerdict(audit, true, "审核正常");

    switch (label) {
        case "normal":
            return audit;

        // Rejected regardless of the score.
        case "politics":
            return applyVerdict(audit, false, "涉政");
        case "terrorism":
            return applyVerdict(audit, false, "暴恐");
        case "contraband":
            return applyVerdict(audit, false, "违禁");

        // Rejected only when the score exceeds the threshold.
        case "abuse":
            return rate > 70.0 ? applyVerdict(audit, false, "辱骂") : audit;
        case "porn":
            return rate > 90.0 ? applyVerdict(audit, false, "色情") : audit;

        // Let through, but tagged with the category when the score exceeds the threshold.
        case "spam":
            return rate > 50.0 ? applyVerdict(audit, true, "含垃圾信息") : audit;
        case "ad":
            return rate > 50.0 ? applyVerdict(audit, true, "广告") : audit;
        case "qrcode":
            return rate > 60.0 ? applyVerdict(audit, true, "二维码") : audit;
        case "flood":
            return rate > 95.0 ? applyVerdict(audit, true, "灌水") : audit;
        case "meaningless":
            return rate > 95.0 ? applyVerdict(audit, true, "无意义") : audit;

        default:
            return applyVerdict(audit, true, "自定义");
    }
}

/** Writes the given outcome and message onto {@code audit} and returns it. */
private AuditInfo applyVerdict(AuditInfo audit, boolean passed, String msg) {
    audit.setResult(passed);
    audit.setMsg(msg);
    return audit;
}


/**
 * Result of auditing one piece of text.
 *
 * @author: craywen
 * @date: 2021-05-25 14:23
 * @desc: audit result
 */
@Data
public class AuditInfo {
    
    

    /**
     * Audit outcome.
     */
    private boolean result;

    /**
     * Message returned together with the outcome.
     */
    private String msg;

    /**
     * Content.
     */
    private String content;

}

https://blog.csdn.net/qq_38893133/article/details/117294596

相关信息

评论