# coding: utf-8
# Copyright 2023 IBM All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: Add parameters validation in every method
from ibm_cloud_sdk_core import BaseService
from ibm_watson_openscale.utils.utils import validate_type
# from ibm_watson_openscale.utils.utils import check_plan_usage, update_plan_usage
from ibm_watson_openscale.supporting_classes.metrics.utils import is_entitled_on_cloud
from ibm_cloud_sdk_core.authenticators import BearerTokenAuthenticator
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
class LLMMetrics():
def __init__(self, ai_client: "WatsonOpenScaleV2Adapter") -> None:
validate_type(ai_client, "ai_client", BaseService, True)
self.ai_client = ai_client
self.is_bearer_token = False
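# Resolve the access token: bearer token authenticators expose it directly, other authenticators provide it through their token manager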
if isinstance(self.ai_client.authenticator, BearerTokenAuthenticator):
self.token = self.ai_client.authenticator.bearer_token
self.is_bearer_token = True
else:
self.token = self.ai_client.authenticator.token_manager.get_token()
def compute_metrics(self, configuration: dict, sources: pd.DataFrame = None, predictions: pd.DataFrame = None, references: pd.DataFrame = None, custom_evaluators=None, **kwargs):
"""
Compute LLM based metrics based on the configuration.

:param dict configuration: configuration specifying the metric groups and the individual metrics to compute.
:param DataFrame sources: data frame containing the input data (if required, or else an empty data frame).
:param DataFrame predictions: data frame containing the model prediction data (if required, or else an empty data frame).
:param DataFrame references: data frame containing the reference data (if required, or else an empty data frame).
:param list custom_evaluators: list of custom evaluator functions that compute additional custom metrics.
:return: Key/Value pair where key is the metric name and value is an object consisting of the metric results for all individual metrics.
:rtype: dict

This is how the configuration parameter dict will look:
>>>
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMTextMetricGroup, LLMSummarizationMetrics, HAP_SCORE
metric_config = {
"configuration": {
LLMTextMetricGroup.SUMMARIZATION.value: { # This is the metric group
LLMSummarizationMetrics.ROUGE_SCORE.value: { # This is an individual metric and contains its specific parameters, if required
"use_aggregator": True,
"use_stemmer": True
},
LLMSummarizationMetrics.SARI.value: { # This is an individual metric and contains its specific parameters, if required
},
LLMSummarizationMetrics.BLEURT_SCORE.value: {},
HAP_SCORE: {},
LLMSummarizationMetrics.SACREBLEU.value: {},
LLMSummarizationMetrics.WIKI_SPLIT.value: {},
LLMSummarizationMetrics.METEOR.value: {},
LLMSummarizationMetrics.NORMALIZED_RECALL.value: {},
LLMSummarizationMetrics.NORMALIZED_PRECISION.value: {},
LLMSummarizationMetrics.NORMALIZED_F1_SCORE.value: {},
}
}
}
A way you might use me is:
>>> client.llm_metrics.compute_metrics(configuration, sources, predictions, references)
Users can pass custom_evaluators as an argument to compute custom metrics. Note that the evaluator functions also receive the keyword arguments, so they should accept **kwargs.
eg: def fun1(sources: pd.DataFrame, predictions: pd.DataFrame, references: pd.DataFrame, **kwargs):
        # compute custom metrics and return them as a dict
custom_evaluators = [fun1]
>>> client.llm_metrics.compute_metrics(configuration, sources, predictions, references, custom_evaluators = custom_evaluators)
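An illustrative evaluator (the function name, metric name, and computation below are only an example, not a built-in metric):
>>> def response_word_count(sources: pd.DataFrame, predictions: pd.DataFrame, references: pd.DataFrame, **kwargs):
        # illustrative only: return a dict keyed by the custom metric name
        return {"mean_response_word_count": predictions.iloc[:, 0].astype(str).str.split().str.len().mean()}
>>> client.llm_metrics.compute_metrics(configuration, sources, predictions, references, custom_evaluators = [response_word_count])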
"""
metrics = {}
self.__validate_params(configuration, sources, predictions, references)
llm_metric_manager = self.__get_metrics_manager(
configuration, kwargs)
metrics = llm_metric_manager.compute(
sources, predictions, references, **kwargs)
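# Run any user supplied evaluators and merge their results into the metrics dict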
for fun in (custom_evaluators or []):
custom_metric = fun(sources, predictions, references, **kwargs)
metrics.update(custom_metric)
# if not self.ai_client.is_cp4d:
# update_plan_usage(self.ai_client)
return metrics
def get_metrics_result(self, configuration: dict, metrics_result, **kwargs):
"""
Get the result of metrics that are computed on the server. Used for the faithfulness metric.

:param configuration: The configuration of the metrics for which to get the response.
:param metrics_result: The metrics result dictionary containing the details of the computation tasks triggered. This will be the output of the method 'compute_metrics'.
:return: Key/Value pair where key is the metric name and value is an object consisting of the metric results for all individual metrics.

This is how the configuration parameter dict will look:
>>>
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMTextMetricGroup, LLMSummarizationMetrics, HAP_SCORE
metric_config = {
"configuration": {
LLMTextMetricGroup.RAG.value: { # This is the metric group
LLMSummarizationMetrics.ROUGE_SCORE.value: { # This is an individual metric and contains its specific parameters, if required
"use_aggregator": True,
"use_stemmer": True
},
LLMSummarizationMetrics.FAITHFULNESS.value: { # This is an individual metric and contains its specific parameters, if required
},
LLMSummarizationMetrics.ANSWER_RELEVANCE.value: {}
}
}
}
A way you might use me is:
>>> metrics_result = client.llm_metrics.compute_metrics(configuration, sources, predictions, references)
>>> final_result = client.llm_metrics.get_metrics_result(configuration=configuration, metrics_result=metrics_result)
"""
validate_type(configuration, "configuration", dict, True)
llm_metric_manager = self.__get_metrics_manager(
configuration, kwargs)
return llm_metric_manager.get_metrics_result(
metrics_result, **kwargs)
def __get_metrics_manager(self, configuration, kwargs):
try:
metric_manager_module = __import__(
"ibm_metrics_plugin.metrics.llm.core.llm_metrics_manager", fromlist=["LLMMetricManager"])
except Exception as e:
msg = "Unable to find metric-plugins library with LLM support to compute metrics. Please install it using `pip install ibm-metrics-plugin`"
raise Exception(msg) from e
self.__check_entitlement()
# Allow the user to compute metrics only when the plan usage is within its limit on cloud
# if not self.ai_client.is_cp4d:
# check_plan_usage(self.ai_client)
llm_metric_manager = getattr(
metric_manager_module, "LLMMetricManager")(configuration, **kwargs)
kwargs["is_bearer_token"] = self.is_bearer_token
kwargs["authenticator"] = self.ai_client.authenticator
kwargs["service_url"] = self.ai_client.service_url
kwargs["is_cp4d"] = self.ai_client.is_cp4d
kwargs["origin"] = "sdk"
return llm_metric_manager
def show_supported_metrics(self):
"""
List all the supported LLM based metrics for different prompt types
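A way you might use me is:
>>> client.llm_metrics.show_supported_metrics()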
"""
self.__show_supported_metrics()
def __show_supported_metrics(self):
try:
metric_type_module = __import__("ibm_metrics_plugin.metrics.llm.utils.constants", fromlist=[
"LLMSummarizationMetrics,LLMGenerationMetrics, LLMExtractionMetrics, LLMQAMetrics"])
except Exception as e:
msg = "Unable to find metric-plugins library with LLM support to list metrics. Please install it using `pip install ibm-metrics-plugin`"
raise Exception(msg) from e
metric_type = getattr(metric_type_module, "LLMSummarizationMetrics")
print("Following Text Summrization metrics are supported")
for m in metric_type:
print(" {}".format(m))
print(" ---------- ")
metric_type = getattr(metric_type_module, "LLMGenerationMetrics")
print("Following Text Generation metrics are supported")
for m in metric_type:
print(" {}".format(m))
print(" ---------- ")
metric_type = getattr(metric_type_module, "LLMExtractionMetrics")
print("Following Text Extraction metrics are supported")
for m in metric_type:
print(" {}".format(m))
print(" ---------- ")
metric_type = getattr(metric_type_module, "LLMQAMetrics")
print("Following Question and Answer metrics are supported")
for m in metric_type:
print(" {}".format(m))
print(" ---------- ")
def __validate_params(self, configuration, sources, predictions, references):
validate_type(configuration, "configuration", dict, True)
validate_type(sources, "data_frame", [pd.DataFrame], False)
validate_type(predictions, "data_frame", [pd.DataFrame], False)
validate_type(references, "data_frame", [pd.DataFrame], False)
def display_result(self, results):
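"""
Display the robustness metrics results.

:param results: The robustness metrics result to display.
"""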
try:
from ibm_metrics_plugin.metrics.llm.common.impl.robustness_metric import RobustnessMetrics
except Exception as e:
msg = "Unable to find metric-plugins library. Please install it using `pip install ibm-metrics-plugin`"
raise Exception(msg) from e
return RobustnessMetrics.display_robustness_results(results)
def __check_entitlement(self):
# Allow the user to compute metrics only if they have a valid paid plan on cloud
if self.ai_client.is_cp4d is not True:
is_entitled_on_cloud(self.ai_client.service_url,
self.ai_client.service_instance_id, self.token)