# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ast
import os
import re
import shutil
import tempfile
import unittest
from pathlib import Path
from unittest import mock, skip

import torch

from accelerate.test_utils.examples import compare_against_test
from accelerate.test_utils.testing import (
    TempDirTestCase,
    get_launch_command,
    require_huggingface_suite,
    require_multi_device,
    require_multi_gpu,
    require_non_xpu,
    require_pippy,
    require_schedulefree,
    require_trackers,
    run_command,
    slow,
)
from accelerate.utils import write_basic_config


# DataLoaders built from `test_samples/MRPC` for quick testing
# Should mock `{script_name}.get_dataloaders` via:
# @mock.patch("{script_name}.get_dataloaders", mocked_dataloaders)
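#
# For illustration only (a hedged sketch, not used by this module; the signature and
# helper below are assumptions based on the example scripts):
#
#     def mocked_dataloaders(accelerator, batch_size: int = 16):
#         # Build tiny train/eval DataLoaders from the files in `test_samples/MRPC`
#         # and return them in place of the script's real `get_dataloaders`.
#         ...
#
# The feature tests in this file take a different route: the `mock.patch.dict`
# decorator on `FeatureExamplesTests` sets `TESTING_MOCKED_DATALOADERS=1` in the
# environment instead of patching each script directly.
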
EXCLUDE_EXAMPLES = [
    "cross_validation.py",
    "checkpointing.py",
    "gradient_accumulation.py",
    "local_sgd.py",
    "multi_process_metrics.py",
    "memory.py",
    "schedule_free.py",
    "tracking.py",
    "automatic_gradient_accumulation.py",
    "fsdp_with_peak_mem_tracking.py",
    "deepspeed_with_config_support.py",
    "megatron_lm_gpt_pretraining.py",
    "early_stopping.py",
    "ddp_comm_hook.py",
    "profiler.py",
]
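# Every script listed above is skipped by `ExampleDifferenceTests.one_complete_example`
# when the `complete_*` examples are diffed against the `by_feature` scripts.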


class ExampleDifferenceTests(unittest.TestCase):
    """
    This TestCase checks that all of the `complete_*` scripts contain all of the
    information found in the `by_feature` scripts, line for line. If one fails,
    then a complete example does not contain all of the features in the feature
    scripts and should be updated.

    Each example script should be a single test (such as `test_nlp_example`),
    and should run `one_complete_example` twice: once with `parser_only=True`
    and once with `parser_only=False`. That way, when a test failure is reported
    to the user, it is clear whether the discrepancy lies in the `main` function
    or in the `training_function`.

    Also, if there are any expected differences between the base script used and
    `complete_nlp_example.py` (the canonical base script), these should be included in
    `special_strings`. These would be differences in how something is logged, print
    statements, etc. (such as calls to `Accelerator.log()`).
    """

    by_feature_path = Path("examples", "by_feature").resolve()
    examples_path = Path("examples").resolve()

    def one_complete_example(
        self, complete_file_name: str, parser_only: bool, secondary_filename: str = None, special_strings: list = None
    ):
        """
        Tests a single `complete` example against all of the implemented `by_feature` scripts

        Args:
            complete_file_name (`str`):
                The filename of a complete example
            parser_only (`bool`):
                Whether to look at the main training function, or the argument parser
            secondary_filename (`str`, *optional*):
                A potential secondary base file to strip all script information not relevant for checking,
                such as "cv_example.py" when testing "complete_cv_example.py"
            special_strings (`list`, *optional*):
                A list of strings to potentially remove before checking no differences are left. These should be
                diffs that are file specific, such as different logging variations between files.
        """
        self.maxDiff = None
        for item in os.listdir(self.by_feature_path):
            if item not in EXCLUDE_EXAMPLES:
                item_path = self.by_feature_path / item
                if item_path.is_file() and item_path.suffix == ".py":
                    with self.subTest(
                        tested_script=complete_file_name,
                        feature_script=item,
                        tested_section="main()" if parser_only else "training_function()",
                    ):
                        diff = compare_against_test(
                            self.examples_path / complete_file_name, item_path, parser_only, secondary_filename
                        )
                        diff = "\n".join(diff)
                        if special_strings is not None:
                            for string in special_strings:
                                diff = diff.replace(string, "")
                        # After stripping the expected, file-specific differences, nothing should remain.
                        assert diff == ""

    def test_nlp_examples(self):
        self.one_complete_example("complete_nlp_example.py", True)
        self.one_complete_example("complete_nlp_example.py", False)

    def test_cv_examples(self):
        cv_path = (self.examples_path / "cv_example.py").resolve()
        special_strings = [
            " " * 16 + "{\n\n",
            " " * 20 + '"accuracy": eval_metric["accuracy"],\n\n',
            " " * 20 + '"f1": eval_metric["f1"],\n\n',
            " " * 20 + '"train_loss": total_loss.item() / len(train_dataloader),\n\n',
            " " * 20 + '"epoch": epoch,\n\n',
            " " * 16 + "},\n\n",
            " " * 16 + "step=epoch,\n",
            " " * 12,
            " " * 8 + "for step, batch in enumerate(active_dataloader):\n",
        ]
        self.one_complete_example("complete_cv_example.py", True, cv_path, special_strings)
        self.one_complete_example("complete_cv_example.py", False, cv_path, special_strings)


@mock.patch.dict(os.environ, {"TESTING_MOCKED_DATALOADERS": "1"})
@require_huggingface_suite
class FeatureExamplesTests(TempDirTestCase):
    clear_on_setup = False
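    # `TESTING_MOCKED_DATALOADERS=1` lets the example scripts swap in the small mocked
    # DataLoaders described at the top of this file, keeping these runs fast.
    # `clear_on_setup = False` keeps the shared temp dir between tests, presumably so that
    # checkpoints written by the `test_checkpointing_*` tests are still available to the
    # `test_load_states_*` resume tests below.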

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls._tmpdir = tempfile.mkdtemp()
        cls.config_file = Path(cls._tmpdir) / "default_config.yml"

        write_basic_config(save_location=cls.config_file)
        cls.launch_args = get_launch_command(config_file=cls.config_file)
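        # `launch_args` is the launcher prefix (effectively an `accelerate launch
        # --config_file ...` invocation) that each test below prepends to its script
        # path and arguments before handing the full command to `run_command`.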

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()
        shutil.rmtree(cls._tmpdir)

    def test_checkpointing_by_epoch(self):
        testargs = f"""
        examples/by_feature/checkpointing.py
        --checkpointing_steps epoch
        --output_dir {self.tmpdir}
        """.split()
        run_command(self.launch_args + testargs)
        assert (self.tmpdir / "epoch_0").exists()

    def test_checkpointing_by_steps(self):
        testargs = f"""
        examples/by_feature/checkpointing.py
        --checkpointing_steps 1
        --output_dir {self.tmpdir}
        """.split()
        _ = run_command(self.launch_args + testargs)
        assert (self.tmpdir / "step_2").exists()

    def test_load_states_by_epoch(self):
        testargs = f"""
        examples/by_feature/checkpointing.py
        --resume_from_checkpoint {self.tmpdir / "epoch_0"}
        """.split()
        output = run_command(self.launch_args + testargs, return_stdout=True)
        # Resuming from the `epoch_0` checkpoint should skip epoch 0 and go straight to epoch 1.
        assert "epoch 0:" not in output
        assert "epoch 1:" in output

    def test_load_states_by_steps(self):
        testargs = f"""
        examples/by_feature/checkpointing.py
        --resume_from_checkpoint {self.tmpdir / "step_2"}
        """.split()
        output = run_command(self.launch_args + testargs, return_stdout=True)
        if torch.cuda.is_available():
            num_processes = torch.cuda.device_count()
        else:
            num_processes = 1
        if num_processes > 1:
            # With multiple processes the data is sharded, so step 2 is expected to land
            # past the end of epoch 0 and resuming skips its logs entirely.
            assert "epoch 0:" not in output
            assert "epoch 1:" in output
        else:
            # With a single process, step 2 is still expected to fall inside epoch 0,
            # so both epochs are logged.
            assert "epoch 0:" in output
            assert "epoch 1:" in output

    @slow
    def test_cross_validation(self):
        testargs = """
        examples/by_feature/cross_validation.py
        --num_folds 2
        """.split()
        with mock.patch.dict(os.environ, {"TESTING_MOCKED_DATALOADERS": "0"}):
            output = run_command(self.launch_args + testargs, return_stdout=True)
            # Parse the last printed dict that reports an accuracy and check the score.
            results = re.findall("({.+})", output)
            results = [r for r in results if "accuracy" in r][-1]
            results = ast.literal_eval(results)
            assert results["accuracy"] >= 0.75

    def test_multi_process_metrics(self):
        testargs = ["examples/by_feature/multi_process_metrics.py"]
        run_command(self.launch_args + testargs)

    @require_schedulefree
    def test_schedulefree(self):
        testargs = ["examples/by_feature/schedule_free.py"]
        run_command(self.launch_args + testargs)

    @require_trackers
    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
    def test_tracking(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            testargs = f"""
            examples/by_feature/tracking.py
            --with_tracking
            --project_dir {tmpdir}
            """.split()
            run_command(self.launch_args + testargs)
            assert os.path.exists(os.path.join(tmpdir, "tracking"))

    def test_gradient_accumulation(self):
        testargs = ["examples/by_feature/gradient_accumulation.py"]
        run_command(self.launch_args + testargs)

    def test_local_sgd(self):
        testargs = ["examples/by_feature/local_sgd.py"]
        run_command(self.launch_args + testargs)

    def test_early_stopping(self):
        testargs = ["examples/by_feature/early_stopping.py"]
        run_command(self.launch_args + testargs)

    def test_profiler(self):
        testargs = ["examples/by_feature/profiler.py"]
        run_command(self.launch_args + testargs)

    @require_multi_device
    def test_ddp_comm_hook(self):
        testargs = ["examples/by_feature/ddp_comm_hook.py", "--ddp_comm_hook", "fp16"]
        run_command(self.launch_args + testargs)

    @skip(
        reason="stable-diffusion-v1-5 is no longer available. Potentially `Comfy-Org/stable-diffusion-v1-5-archive` once diffusers support is added."
    )
    @require_multi_device
    def test_distributed_inference_examples_stable_diffusion(self):
        testargs = ["examples/inference/distributed/stable_diffusion.py"]
        run_command(self.launch_args + testargs)

    @require_multi_device
    def test_distributed_inference_examples_phi2(self):
        testargs = ["examples/inference/distributed/phi2.py"]
        run_command(self.launch_args + testargs)

    @require_non_xpu
    @require_pippy
    @require_multi_gpu
    def test_pippy_examples_bert(self):
        testargs = ["examples/inference/pippy/bert.py"]
        run_command(self.launch_args + testargs)

    @require_non_xpu
    @require_pippy
    @require_multi_gpu
    def test_pippy_examples_gpt2(self):
        testargs = ["examples/inference/pippy/gpt2.py"]
        run_command(self.launch_args + testargs)