Skip to content

Commit

Permalink
feat: add 12 model-based pointwise metric classes to `vertexai.preview.evaluation.metrics`
Browse files Browse the repository at this point in the history

PiperOrigin-RevId: 644484669
  • Loading branch information
jsondai authored and copybara-github committed Jun 18, 2024
1 parent 04e07db commit 4742a87
Show file tree
Hide file tree
Showing 14 changed files with 525 additions and 1 deletion.
70 changes: 70 additions & 0 deletions tests/unit/vertexai/test_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
from vertexai.preview.evaluation import _base as eval_base
from vertexai.preview.evaluation import _evaluation
from vertexai.preview.evaluation import utils
from vertexai.preview.evaluation.metrics import (
_summarization_quality,
)
from vertexai.preview.evaluation.metrics import (
_pairwise_summarization_quality,
)
Expand Down Expand Up @@ -318,6 +321,73 @@ def test_compute_pointwise_metrics(self, api_transport):
0.5,
]

@pytest.mark.parametrize("api_transport", ["grpc", "rest"])
def test_compute_pointwise_metrics_with_custom_metric_spec(self, api_transport):
aiplatform.init(
project=_TEST_PROJECT,
location=_TEST_LOCATION,
api_transport=api_transport,
)
eval_dataset = pd.DataFrame(
{
"context": ["test", "context"],
"instruction": ["test", "instruction"],
"reference": ["test", "reference"],
}
)
mock_model = mock.create_autospec(
generative_models.GenerativeModel, instance=True
)
mock_model.generate_content.return_value = _MOCK_MODEL_INFERENCE_RESPONSE
mock_model._model_name = "publishers/google/model/gemini-1.0-pro"
test_metrics = [
_summarization_quality.SummarizationQuality(
use_reference=True,
)
]
test_eval_task = evaluation.EvalTask(dataset=eval_dataset, metrics=test_metrics)
mock_metric_results = _MOCK_SUMMARIZATION_QUALITY_RESULT
with mock.patch.object(
target=gapic_evaluation_services.EvaluationServiceAsyncClient,
attribute="evaluate_instances",
side_effect=mock_metric_results,
):
test_result = test_eval_task.evaluate(
model=mock_model,
prompt_template="{instruction} test prompt template {context}",
)

assert test_result.summary_metrics["row_count"] == 2
assert test_result.summary_metrics["summarization_quality/mean"] == 4.5
assert test_result.summary_metrics[
"summarization_quality/std"
] == pytest.approx(0.7, 0.1)
assert set(test_result.metrics_table.columns.values) == set(
[
"context",
"instruction",
"reference",
"completed_prompt",
"response",
"summarization_quality",
"summarization_quality/explanation",
"summarization_quality/confidence",
]
)
assert list(test_result.metrics_table["summarization_quality"].values) == [5, 4]
assert list(
test_result.metrics_table["summarization_quality/explanation"].values
) == [
"explanation",
"explanation",
]
assert list(
test_result.metrics_table["summarization_quality/confidence"].values
) == [
1.0,
0.5,
]

@pytest.mark.parametrize("api_transport", ["grpc", "rest"])
def test_compute_pairwise_metrics_with_model_inference(self, api_transport):
aiplatform.init(
Expand Down
70 changes: 69 additions & 1 deletion vertexai/preview/evaluation/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,84 @@
#
"""Evaluation Metrics Module."""

from vertexai.preview.evaluation.metrics import _base
from vertexai.preview.evaluation.metrics import _coherence
from vertexai.preview.evaluation.metrics import _fluency
from vertexai.preview.evaluation.metrics import _fulfillment
from vertexai.preview.evaluation.metrics import _groundedness
from vertexai.preview.evaluation.metrics import (
_base,
_pairwise_question_answering_quality,
)
from vertexai.preview.evaluation.metrics import (
_pairwise_summarization_quality,
)
from vertexai.preview.evaluation.metrics import (
_question_answering_correctness,
)
from vertexai.preview.evaluation.metrics import (
_question_answering_helpfulness,
)
from vertexai.preview.evaluation.metrics import (
_question_answering_quality,
)
from vertexai.preview.evaluation.metrics import (
_question_answering_relevance,
)
from vertexai.preview.evaluation.metrics import _safety
from vertexai.preview.evaluation.metrics import (
_summarization_helpfulness,
)
from vertexai.preview.evaluation.metrics import (
_summarization_quality,
)
from vertexai.preview.evaluation.metrics import (
_summarization_verbosity,
)


CustomMetric = _base.CustomMetric
PairwiseMetric = _base.PairwiseMetric
make_metric = _base.make_metric

Coherence = _coherence.Coherence
Fluency = _fluency.Fluency
Safety = _safety.Safety
Groundedness = _groundedness.Groundedness
Fulfillment = _fulfillment.Fulfillment
SummarizationQuality = _summarization_quality.SummarizationQuality
SummarizationHelpfulness = _summarization_helpfulness.SummarizationHelpfulness
SummarizationVerbosity = _summarization_verbosity.SummarizationVerbosity
QuestionAnsweringQuality = _question_answering_quality.QuestionAnsweringQuality
QuestionAnsweringRelevance = _question_answering_relevance.QuestionAnsweringRelevance
QuestionAnsweringHelpfulness = (
_question_answering_helpfulness.QuestionAnsweringHelpfulness
)
QuestionAnsweringCorrectness = (
_question_answering_correctness.QuestionAnsweringCorrectness
)
PairwiseSummarizationQuality = (
_pairwise_summarization_quality.PairwiseSummarizationQuality
)
PairwiseQuestionAnsweringQuality = (
_pairwise_question_answering_quality.PairwiseQuestionAnsweringQuality
)

__all__ = [
"CustomMetric",
"PairwiseMetric",
"make_metric",
"Coherence",
"Fluency",
"Safety",
"Groundedness",
"Fulfillment",
"SummarizationQuality",
"SummarizationHelpfulness",
"SummarizationVerbosity",
"QuestionAnsweringQuality",
"QuestionAnsweringRelevance",
"QuestionAnsweringHelpfulness",
"QuestionAnsweringCorrectness",
"PairwiseSummarizationQuality",
"PairwiseQuestionAnsweringQuality",
]
31 changes: 31 additions & 0 deletions vertexai/preview/evaluation/metrics/_coherence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Optional
from vertexai.preview.evaluation import constants
from vertexai.preview.evaluation.metrics import _base


class Coherence(_base._ModelBasedMetric):
"""The model-based pointwise metric for Coherence."""

_metric_name = constants.Metric.COHERENCE

def __init__(self, *, version: Optional[int] = None):
super().__init__(
metric=Coherence._metric_name,
version=version,
)
31 changes: 31 additions & 0 deletions vertexai/preview/evaluation/metrics/_fluency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Optional
from vertexai.preview.evaluation import constants
from vertexai.preview.evaluation.metrics import _base


class Fluency(_base._ModelBasedMetric):
"""The model-based pointwise metric for Fluency."""

_metric_name = constants.Metric.FLUENCY

def __init__(self, *, version: Optional[int] = None):
super().__init__(
metric=Fluency._metric_name,
version=version,
)
31 changes: 31 additions & 0 deletions vertexai/preview/evaluation/metrics/_fulfillment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Optional
from vertexai.preview.evaluation import constants
from vertexai.preview.evaluation.metrics import _base


class Fulfillment(_base._ModelBasedMetric):
"""The model-based pointwise metric for Fulfillment."""

_metric_name = constants.Metric.FULFILLMENT

def __init__(self, *, version: Optional[int] = None):
super().__init__(
metric=Fulfillment._metric_name,
version=version,
)
31 changes: 31 additions & 0 deletions vertexai/preview/evaluation/metrics/_groundedness.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Optional
from vertexai.preview.evaluation import constants
from vertexai.preview.evaluation.metrics import _base


class Groundedness(_base._ModelBasedMetric):
"""The model-based pointwise metric for Groundedness."""

_metric_name = constants.Metric.GROUNDEDNESS

def __init__(self, *, version: Optional[int] = None):
super().__init__(
metric=Groundedness._metric_name,
version=version,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Optional
from vertexai.preview.evaluation import constants
from vertexai.preview.evaluation.metrics import _base


class QuestionAnsweringCorrectness(_base._ModelBasedMetric):
"""The model-based pointwise metric for Question Answering Correctness."""

_metric_name = constants.Metric.QUESTION_ANSWERING_CORRECTNESS

def __init__(self, *, use_reference: bool = True, version: Optional[int] = None):
super().__init__(
metric=QuestionAnsweringCorrectness._metric_name,
use_reference=use_reference,
version=version,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Optional
from vertexai.preview.evaluation import constants
from vertexai.preview.evaluation.metrics import _base


class QuestionAnsweringHelpfulness(_base._ModelBasedMetric):
"""The model-based pointwise metric for Question Answering Helpfulness."""

_metric_name = constants.Metric.QUESTION_ANSWERING_HELPFULNESS

def __init__(self, *, use_reference: bool = False, version: Optional[int] = None):
super().__init__(
metric=QuestionAnsweringHelpfulness._metric_name,
use_reference=use_reference,
version=version,
)
33 changes: 33 additions & 0 deletions vertexai/preview/evaluation/metrics/_question_answering_quality.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Optional
from vertexai.preview.evaluation import constants
from vertexai.preview.evaluation.metrics import _base


class QuestionAnsweringQuality(_base._ModelBasedMetric):
"""The model-based pointwise metric for Question Answering Quality."""

_metric_name = constants.Metric.QUESTION_ANSWERING_QUALITY

def __init__(self, *, use_reference: bool = False, version: Optional[int] = None):
super().__init__(
metric=QuestionAnsweringQuality._metric_name,
use_reference=use_reference,
version=version,
)
Loading

0 comments on commit 4742a87

Please sign in to comment.