Source code for qianfan.common.client.evaluation

# Copyright (c) 2023 Baidu, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import List, Optional, Set

import typer
from rich.console import Console, RenderableType
from rich.pretty import Pretty
from rich.table import Table

from qianfan.common.client.dataset import load_dataset
from qianfan.common.client.utils import (
    credential_required,
    print_error_msg,
    print_info_msg,
    print_warn_msg,
)
from qianfan.errors import InternalError
from qianfan.evaluation import EvaluationManager
from qianfan.evaluation.consts import (
    QianfanRefereeEvaluatorDefaultMaxScore,
    QianfanRefereeEvaluatorDefaultMetrics,
    QianfanRefereeEvaluatorDefaultSteps,
)
from qianfan.evaluation.evaluator import (
    ManualEvaluatorDimension,
    QianfanEvaluator,
    QianfanManualEvaluator,
    QianfanRefereeEvaluator,
    QianfanRuleEvaluator,
)
from qianfan.model import Model
from qianfan.resources.console.model import Model as ModelResource

# Typer sub-application that holds the evaluation-related CLI commands
# registered in this module.
evaluation_app = typer.Typer(
    # Invoking the app without any argument prints the help text.
    no_args_is_help=True,
    # Accept the short "-h" flag in addition to "--help".
    context_settings={"help_option_names": ["-h", "--help"]},
    help="Evaluation utils.",
)

# Titles of the rich help panels; options of the `run` command pass one of
# these as `rich_help_panel` so that `--help` groups them per evaluator.
RULE_EVALUATOR_PANEL = "Rule Evaluator Options"
REFEREE_EVALUATOR_PANEL = "Referee Evaluator Options"
MANUAL_EVALUATOR_PANEL = "Manual Evaluator Options"


@credential_required
def list_evaluable_models(
    ctx: typer.Context, param: typer.CallbackParam, value: bool
) -> None:
    """
    Option callback that prints the evaluable models as a table and exits.

    When ``value`` is falsy the callback does nothing. Otherwise it fetches
    the evaluable model list, renders one table row per model (name and id,
    train type, and one line per model version) and raises ``typer.Exit``.

    Args:
        ctx: The typer context (unused).
        param: The parameter that triggered the callback (unused).
        value: The parsed value of the option this callback is attached to.

    Raises:
        typer.Exit: Always, once the table has been printed.
    """
    if not value:
        return

    evaluable_models = ModelResource.evaluable_model_list()["result"]
    console = Console()

    table = Table(show_lines=True)
    for header in ("Model Name", "Train Type", "Model Version List"):
        table.add_column(header)

    for model in evaluable_models:
        # The ids are rendered dimmed next to the human readable names.
        cells: List[RenderableType] = [
            f"{model['modelName']}\n[dim]{model['modelIdStr']}[/]",
            model["trainType"],
            "\n".join(
                f"{version['version']} [dim]({version['modelVersionIdStr']})[/]"
                for version in model["modelVersionList"]
            ),
        ]
        table.add_row(*cells)

    console.print(table)
    raise typer.Exit()
@evaluation_app.command()
@credential_required
def run(
    models: List[str] = typer.Argument(
        ..., help="List of model version ids to be evaluated."
    ),
    dataset_id: str = typer.Option(..., help="Dataset id."),
    enable_rule_evaluator: bool = typer.Option(False, help="Enable rule evaluator."),
    using_similarity: bool = typer.Option(
        QianfanRuleEvaluator().using_similarity,
        help="Using similarity to evaluate the results.",
        rich_help_panel=RULE_EVALUATOR_PANEL,
    ),
    using_accuracy: bool = typer.Option(
        QianfanRuleEvaluator().using_accuracy,
        help="Using accuracy to evaluate the results.",
        rich_help_panel=RULE_EVALUATOR_PANEL,
    ),
    stop_words: Optional[str] = typer.Option(
        QianfanRuleEvaluator().stop_words,
        help="Stop words.",
        rich_help_panel=RULE_EVALUATOR_PANEL,
    ),
    enable_referee_evaluator: bool = typer.Option(
        False, help="Enable referee evaluator."
    ),
    app_id: Optional[int] = typer.Option(
        None,
        help=(
            "The appid to which the model belongs to. The model will be used to"
            " evaluate the results."
        ),
        rich_help_panel=REFEREE_EVALUATOR_PANEL,
    ),
    prompt_metrics: str = typer.Option(
        QianfanRefereeEvaluatorDefaultMetrics,
        help="Metrics for the model to evaluate the results.",
        rich_help_panel=REFEREE_EVALUATOR_PANEL,
    ),
    prompt_steps: str = typer.Option(
        QianfanRefereeEvaluatorDefaultSteps,
        help="Steps for the model to evaluate the results.",
        rich_help_panel=REFEREE_EVALUATOR_PANEL,
    ),
    prompt_max_score: int = typer.Option(
        QianfanRefereeEvaluatorDefaultMaxScore,
        help="Max score of the evaluation result.",
        rich_help_panel=REFEREE_EVALUATOR_PANEL,
    ),
    enable_manual_evaluator: bool = typer.Option(
        False, help="Enable manual evaluator."
    ),
    dimensions: Optional[str] = typer.Option(
        None,
        help="Dimensions for evaluation. Use ',' to split multiple dimensions.",
        rich_help_panel=MANUAL_EVALUATOR_PANEL,
    ),
    list_evaluable_models: Optional[bool] = typer.Option(
        None,
        "--list-evaluable-models",
        callback=list_evaluable_models,
        is_eager=True,
        help="Print evaluable models.",
    ),
) -> None:
    """
    Run evaluation task.

    At least one evaluator should be enabled. Manual evaluator may not be mixed
    with other evaluators.
    """
    # NOTE: the docstring above doubles as the command's `--help` text, so
    # maintainer-facing documentation lives in comments instead.
    #
    # Flow:
    #   1. Load the dataset (without downloading it locally) and wrap every
    #      model version id in a `Model`.
    #   2. Build the evaluator list from the `--enable-*` flags, validating
    #      the options each evaluator requires (`app_id` for the referee
    #      evaluator, `dimensions` for the manual evaluator).
    #   3. Run the evaluation and either print a metrics table, or - when no
    #      metrics are returned - point the user at the manual evaluation
    #      page of the created task.
    #
    # Exits with code 1 (typer.Exit) on invalid option combinations and
    # raises InternalError if the evaluation manager reports no task id.
    ds = load_dataset(dataset_id, is_download_to_local=False)
    model_list = [Model(version_id=m) for m in models]
    console = Console()

    evaluators: List[QianfanEvaluator] = []

    if enable_rule_evaluator:
        evaluators.append(
            QianfanRuleEvaluator(
                using_accuracy=using_accuracy,
                using_similarity=using_similarity,
                stop_words=stop_words,
            )
        )

    if enable_referee_evaluator:
        if app_id is None:
            print_error_msg("App_id is required for referee evaluator.")
            raise typer.Exit(1)
        evaluators.append(
            QianfanRefereeEvaluator(
                app_id=app_id,
                prompt_metrics=prompt_metrics,
                prompt_steps=prompt_steps,
                prompt_max_score=prompt_max_score,
            )
        )

    if enable_manual_evaluator:
        # Tolerate whitespace around the separators ("a, b") and ignore empty
        # entries ("a,,b" or a trailing comma) instead of creating dimensions
        # with blank or padded names.
        dimension_list = [
            dim.strip() for dim in (dimensions or "").split(",") if dim.strip()
        ]
        if not dimension_list:
            print_error_msg("Dimensions are required for manual evaluator.")
            raise typer.Exit(1)
        if evaluators:
            # Only a warning: the run continues with the mixed evaluators.
            print_warn_msg("Manual evaluator may not be mixed with other evaluators.")
        evaluators.append(
            QianfanManualEvaluator(
                evaluation_dimensions=[
                    ManualEvaluatorDimension(dimension=dim) for dim in dimension_list
                ]
            )
        )

    if not evaluators:
        print_error_msg("At least one evaluator should be enabled.", exit=True)

    em = EvaluationManager(qianfan_evaluators=evaluators)
    with console.status("Evaluating..."):
        result = em.eval(model_list, ds)

    eval_task_id = em.task_id
    if eval_task_id is None:
        raise InternalError("Evaluation task id should not be None")

    if result is None or result.metrics is None:
        # No metrics to show: direct the user to the console page of the task
        # that was just created. The task id must be interpolated here; the
        # message previously printed a literal "{task_id}" placeholder.
        print_info_msg(
            "The data has been processed. Since manual evaluator is enabled, please go"
            " to"
            f" https://console.bce.baidu.com/qianfan/modelcenter/model/manual/detail/task/{eval_task_id}"
            " to evaluate the results."
        )
        raise typer.Exit(0)

    # One column per top-level key of `result.metrics`, one row per metric
    # name found under any of those keys.
    table = Table()
    cols = list(result.metrics.keys())
    table.add_column("")
    for col in cols:
        table.add_column(col)

    keys: Set[str] = set()
    for column_metrics in result.metrics.values():
        keys.update(column_metrics.keys())

    # Sort the metric names so the row order is stable between runs (plain
    # set iteration order is not).
    for key in sorted(keys):
        cells = [
            # A metric missing for a given column is rendered as None.
            Pretty(result.metrics[col].get(key), overflow="fold")
            for col in cols
        ]
        table.add_row(key, *cells)

    console.print(table)