# Copyright (c) 2023 Baidu, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional, Set
import typer
from rich.console import Console, RenderableType
from rich.pretty import Pretty
from rich.table import Table
from qianfan.common.client.dataset import load_dataset
from qianfan.common.client.utils import (
credential_required,
print_error_msg,
print_info_msg,
print_warn_msg,
)
from qianfan.errors import InternalError
from qianfan.evaluation import EvaluationManager
from qianfan.evaluation.consts import (
QianfanRefereeEvaluatorDefaultMaxScore,
QianfanRefereeEvaluatorDefaultMetrics,
QianfanRefereeEvaluatorDefaultSteps,
)
from qianfan.evaluation.evaluator import (
ManualEvaluatorDimension,
QianfanEvaluator,
QianfanManualEvaluator,
QianfanRefereeEvaluator,
QianfanRuleEvaluator,
)
from qianfan.model import Model
from qianfan.resources.console.model import Model as ModelResource
# Titles of the help panels used to group the evaluator-specific options
# of the `run` command in the rendered `--help` output.
RULE_EVALUATOR_PANEL = "Rule Evaluator Options"
REFEREE_EVALUATOR_PANEL = "Referee Evaluator Options"
MANUAL_EVALUATOR_PANEL = "Manual Evaluator Options"

# Typer application holding the evaluation commands. Help is shown when it is
# invoked without arguments, and both `-h` and `--help` are accepted.
evaluation_app = typer.Typer(
    help="Evaluation utils.",
    no_args_is_help=True,
    context_settings=dict(help_option_names=["-h", "--help"]),
)
@credential_required
def list_evaluable_models(
    ctx: typer.Context, param: typer.CallbackParam, value: bool
) -> None:
    """
    Print the models that can be evaluated and exit.

    Used as the callback of the `--list-evaluable-models` option of `run`.

    Args:
        ctx: Typer context of the current invocation (unused).
        param: The option that triggered this callback (unused).
        value: Value of the flag; nothing happens unless it is truthy.

    Raises:
        typer.Exit: After the table has been printed, so that the command
            itself is not executed.
    """
    # The callback is invoked even when the flag was not given; do nothing then.
    if not value:
        return

    evaluable_models = ModelResource.evaluable_model_list()["result"]

    table = Table(show_lines=True)
    for header in ("Model Name", "Train Type", "Model Version List"):
        table.add_column(header)

    for model in evaluable_models:
        # One line per version, with the version id rendered dimmed.
        versions = "\n".join(
            f"{version['version']} [dim]({version['modelVersionIdStr']})[/]"
            for version in model["modelVersionList"]
        )
        row_items: List[RenderableType] = [
            f"{model['modelName']}\n[dim]{model['modelIdStr']}[/]",
            model["trainType"],
            versions,
        ]
        table.add_row(*row_items)

    Console().print(table)
    raise typer.Exit()
@evaluation_app.command()
@credential_required
def run(
    models: List[str] = typer.Argument(
        ..., help="List of model version ids to be evaluated."
    ),
    dataset_id: str = typer.Option(..., help="Dataset id."),
    enable_rule_evaluator: bool = typer.Option(False, help="Enable rule evaluator."),
    using_similarity: bool = typer.Option(
        QianfanRuleEvaluator().using_similarity,
        help="Using similarity to evaluate the results.",
        rich_help_panel=RULE_EVALUATOR_PANEL,
    ),
    using_accuracy: bool = typer.Option(
        QianfanRuleEvaluator().using_accuracy,
        help="Using accuracy to evaluate the results.",
        rich_help_panel=RULE_EVALUATOR_PANEL,
    ),
    stop_words: Optional[str] = typer.Option(
        QianfanRuleEvaluator().stop_words,
        help="Stop words.",
        rich_help_panel=RULE_EVALUATOR_PANEL,
    ),
    enable_referee_evaluator: bool = typer.Option(
        False, help="Enable referee evaluator."
    ),
    app_id: Optional[int] = typer.Option(
        None,
        help=(
            "The appid to which the model belongs to. The model will be used to"
            " evaluate the results."
        ),
        rich_help_panel=REFEREE_EVALUATOR_PANEL,
    ),
    prompt_metrics: str = typer.Option(
        QianfanRefereeEvaluatorDefaultMetrics,
        help="Metrics for the model to evaluate the results.",
        rich_help_panel=REFEREE_EVALUATOR_PANEL,
    ),
    prompt_steps: str = typer.Option(
        QianfanRefereeEvaluatorDefaultSteps,
        help="Steps for the model to evaluate the results.",
        rich_help_panel=REFEREE_EVALUATOR_PANEL,
    ),
    prompt_max_score: int = typer.Option(
        QianfanRefereeEvaluatorDefaultMaxScore,
        help="Max score of the evaluation result.",
        rich_help_panel=REFEREE_EVALUATOR_PANEL,
    ),
    enable_manual_evaluator: bool = typer.Option(
        False, help="Enable manual evaluator."
    ),
    dimensions: Optional[str] = typer.Option(
        None,
        help="Dimensions for evaluation. Use ',' to split multiple dimensions.",
        rich_help_panel=MANUAL_EVALUATOR_PANEL,
    ),
    list_evaluable_models: Optional[bool] = typer.Option(
        None,
        "--list-evaluable-models",
        callback=list_evaluable_models,
        is_eager=True,
        help="Print evaluable models.",
    ),
) -> None:
    """
    Run evaluation task.

    At least one evaluator should be enabled.

    Manual evaluator may not be mixed with other evaluators.
    """
    # NOTE: the docstring above doubles as the CLI help text, so the
    # implementation notes live in comments instead.
    #
    # Flow: validate the options and build the evaluators, run the evaluation
    # on the given models and dataset, then print one table row per metric and
    # one column per key of `result.metrics`.
    #
    # Exits with status 1 when a required evaluator option is missing and with
    # status 0 when no metrics are available to print. Raises InternalError if
    # the evaluation manager reports no task id.
    console = Console()

    # Validate the options before loading the dataset so that bad flags are
    # reported without doing any of the later work.
    evaluators: List[QianfanEvaluator] = []
    if enable_rule_evaluator:
        evaluators.append(
            QianfanRuleEvaluator(
                using_accuracy=using_accuracy,
                using_similarity=using_similarity,
                stop_words=stop_words,
            )
        )
    if enable_referee_evaluator:
        if app_id is None:
            print_error_msg("App_id is required for referee evaluator.")
            raise typer.Exit(1)
        evaluators.append(
            QianfanRefereeEvaluator(
                app_id=app_id,
                prompt_metrics=prompt_metrics,
                prompt_steps=prompt_steps,
                prompt_max_score=prompt_max_score,
            )
        )
    if enable_manual_evaluator:
        # Tolerate spaces around the separators and ignore empty entries, so
        # "a, b," yields ["a", "b"].
        dimension_list = [
            dim.strip() for dim in (dimensions or "").split(",") if dim.strip()
        ]
        if not dimension_list:
            print_error_msg("Dimensions are required for manual evaluator.")
            raise typer.Exit(1)
        if evaluators:
            print_warn_msg("Manual evaluator may not be mixed with other evaluators.")
        evaluators.append(
            QianfanManualEvaluator(
                evaluation_dimensions=[
                    ManualEvaluatorDimension(dimension=dim) for dim in dimension_list
                ]
            )
        )
    if not evaluators:
        print_error_msg("At least one evaluator should be enabled.", exit=True)

    ds = load_dataset(dataset_id, is_download_to_local=False)
    model_list = [Model(version_id=m) for m in models]

    em = EvaluationManager(qianfan_evaluators=evaluators)
    with console.status("Evaluating..."):
        result = em.eval(model_list, ds)

    eval_task_id = em.task_id
    if eval_task_id is None:
        raise InternalError("Evaluation task id should not be None")

    if result is None or result.metrics is None:
        # No metrics to show: point the user at the console page of the task.
        print_info_msg(
            "The data has been processed. Since manual evaluator is enabled, please go"
            " to https://console.bce.baidu.com/qianfan/modelcenter/model/manual/detail/task/"
            f"{eval_task_id} to evaluate the results."
        )
        raise typer.Exit(0)

    metrics = result.metrics
    cols = list(metrics)

    table = Table()
    table.add_column("")
    for col in cols:
        table.add_column(col)

    # Union of the metric names over all columns, kept in first-seen order so
    # that the row order is the same on every run.
    metric_names = dict.fromkeys(
        name for column_metrics in metrics.values() for name in column_metrics
    )
    for name in metric_names:
        # A column that lacks this metric shows None in its cell.
        cells = [Pretty(metrics[col].get(name), overflow="fold") for col in cols]
        table.add_row(name, *cells)

    console.print(table)