Source code for qianfan.evaluation.evaluator

# Copyright (c) 2023 Baidu, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""
collection of evaluators
"""
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Union

from qianfan.evaluation.consts import (
    QianfanRefereeEvaluatorDefaultMaxScore,
    QianfanRefereeEvaluatorDefaultMetrics,
    QianfanRefereeEvaluatorDefaultSteps,
)
from qianfan.utils import log_error
from qianfan.utils.pydantic import BaseModel, Field, root_validator


class Evaluator(BaseModel, ABC):
    """Abstract base class for evaluating a single entry."""

    @abstractmethod
    def evaluate(
        self,
        input: Union[str, List[Dict[str, Any]]],
        reference: str,
        output: str,
    ) -> Dict[str, Any]:
        """
        Evaluate one entry.

        Args:
            input (Union[str, List[Dict[str, Any]]]):
                the input string, or the chat history as a list of dicts.
            reference (str):
                the standard answer for the input.
            output (str):
                the llm output string to be evaluated.

        Returns:
            Dict[str, Any]: evaluation metrics and their values
                for this single llm output.
        """
class LocalEvaluator(Evaluator, ABC):
    """
    Base class for evaluators that run locally.

    Users who want to implement their own LocalEvaluator should
    override the `evaluate` function, where:

    - `input` is the input string or the chat history,
    - `reference` is the standard answer for the input,
    - `output` is the llm output string.

    The return value should be a Dict containing the evaluation
    metrics and the metric values for a single llm output.
    """
class QianfanEvaluator(Evaluator):
    """Base class for qianfan evaluators, with an empty `evaluate`."""

    def evaluate(
        self,
        input: Union[str, List[Dict[str, Any]]],
        reference: str,
        output: str,
    ) -> Dict[str, Any]:
        """Return an empty dict; none of the arguments are used."""
        # This method is not supposed to be implemented for this class,
        # so an empty value is returned here.
        return dict()
class QianfanRefereeEvaluator(QianfanEvaluator):
    """Configuration class for the qianfan referee evaluator."""

    # Required: no default value is provided for the app id.
    app_id: int

    # Prompt settings; each one falls back to the corresponding
    # default constant imported from qianfan.evaluation.consts.
    prompt_metrics: str = QianfanRefereeEvaluatorDefaultMetrics
    prompt_steps: str = QianfanRefereeEvaluatorDefaultSteps
    prompt_max_score: int = QianfanRefereeEvaluatorDefaultMaxScore
class QianfanRuleEvaluator(QianfanEvaluator):
    """Configuration class for the qianfan rule evaluator."""

    # Both switches are disabled unless explicitly turned on.
    using_similarity: bool = False
    using_accuracy: bool = False

    # Optional stop words setting; unset (None) by default.
    stop_words: Optional[str] = None
class ManualEvaluatorDimension(BaseModel):
    """A single dimension used in manual evaluation mode."""

    # Required name of the dimension.
    dimension: str

    # Optional free-text description of the dimension.
    description: Optional[str] = None
class QianfanManualEvaluator(QianfanEvaluator):
    """Configuration class for the qianfan manual evaluator."""

    # Defaults to a single "满意度" (Chinese for "satisfaction") dimension.
    evaluation_dimensions: List[ManualEvaluatorDimension] = Field(
        default=[ManualEvaluatorDimension(dimension="满意度")]
    )

    @root_validator
    @classmethod
    def dimension_validation(cls, input_dict: Any) -> Any:
        """
        Validate and normalize the `evaluation_dimensions` field.

        The resulting list always starts with a dimension named "满意度":
        if such a dimension already exists it is swapped with the first
        element, otherwise a new one is prepended.

        Args:
            input_dict (Any): the values dict handed to the validator.

        Returns:
            Any: the same dict with `evaluation_dimensions` normalized.

        Raises:
            ValueError: if no dimension has been provided.
        """
        assert isinstance(input_dict, dict)

        dimensions: List[ManualEvaluatorDimension] = input_dict.get(
            "evaluation_dimensions", []
        )
        if not dimensions:
            err_msg = "no dimension has been provided"
            log_error(err_msg)
            raise ValueError(err_msg)

        # Position of the first "满意度" dimension, or None when absent.
        found_at = next(
            (
                position
                for position, entry in enumerate(dimensions)
                if entry.dimension == "满意度"
            ),
            None,
        )

        if found_at is None:
            # Not present: put a fresh one in front of the given dimensions.
            input_dict["evaluation_dimensions"] = [
                ManualEvaluatorDimension(dimension="满意度")
            ] + dimensions
            return input_dict

        if found_at != 0:
            # Present but not first: swap it with the current first element.
            dimensions[0], dimensions[found_at] = (
                dimensions[found_at],
                dimensions[0],
            )

        input_dict["evaluation_dimensions"] = dimensions
        return input_dict