Source code for alomancy.core.base_active_learning

import os
from abc import ABC, abstractmethod
from pathlib import Path

import pandas as pd
from ase import Atoms
from ase.io import read, write

from alomancy.analysis.plotting import mae_al_loop_plot
from alomancy.utils.clean_structures import clean_structures



[docs]
class BaseActiveLearningWorkflow(ABC):
    """
    Abstract base class for active learning (AL) workflows.

    This class provides the core AL loop structure (see `run`) while requiring
    subclasses to implement the specific methods for structure generation,
    high-accuracy evaluation, and MLIP training/evaluation.

    Subclasses must implement the following abstract methods:
    - `high_accuracy_evaluation`
    - `train_mlip`
    - `generate_structures`

    Parameters
    ----------
    initial_train_file_path : str
        Path of the file holding the initial training structures.
    initial_test_file_path : str
        Path of the file holding the initial test structures.
    jobs_dict : dict
        Job configuration. `run` reads the keys ``"mlip_committee"`` and
        ``"high_accuracy_evaluation"`` and hands the complete dictionary to
        `generate_structures`.
    number_of_al_loops : int
        Exclusive upper bound of the loop index; loops
        ``start_loop .. number_of_al_loops - 1`` are executed.
    verbose : int
        Progress messages are printed when this is greater than 0.
    start_loop : int
        Index of the first AL loop to execute.
    plots : bool
        When true, `mae_al_loop_plot` is called with the evaluation results
        of every loop.
    """

    def __init__(
        self,
        initial_train_file_path: str,
        initial_test_file_path: str,
        jobs_dict: dict,
        number_of_al_loops: int = 5,
        verbose: int = 0,
        start_loop: int = 0,
        plots: bool = True,
    ):
        self.initial_train_file = Path(initial_train_file_path)
        self.initial_test_file = Path(initial_test_file_path)
        self.jobs_dict = jobs_dict
        self.number_of_al_loops = number_of_al_loops
        self.verbose = verbose
        self.start_loop = start_loop
        self.plots = plots

    def run(self, **kwargs) -> None:
        """
        Run the active learning workflow.

        For every loop index the current training and test sets are written
        to ``results/al_loop_<index>/``, then the abstract steps are called in
        this order: `train_mlip`, `generate_structures`,
        `high_accuracy_evaluation`. The structures returned by the last step
        are passed through `clean_structures` and appended to the training
        set used by the next loop.

        Parameters
        ----------
        **kwargs
            Forwarded unchanged to `train_mlip`, `generate_structures` and
            `high_accuracy_evaluation`.

        Raises
        ------
        AssertionError
            If the initial training or test file yields fewer than two
            structures.
        OSError
            If the training/test set files cannot be written (outside of a
            pytest run).
        """
        # NOTE(review): the training set always starts from the initial file,
        # even when start_loop > 0; structures added by earlier loops of a
        # previous run are not reloaded here - confirm this is intended.
        train_xyzs, test_xyzs = self._load_initial_train_test_sets(dummy_run=False)

        for loop in range(self.start_loop, self.number_of_al_loops):
            base_name = f"al_loop_{loop}"
            workdir = Path(f"results/{base_name}")

            self._write_current_sets(workdir, train_xyzs, test_xyzs)

            if self.verbose > 0:
                print(f"Starting AL loop {loop}")
                print(f"  Training set size: {len(train_xyzs)}")
                print(f"  Test set size: {len(test_xyzs)}")

            # Core AL loop steps - these methods must be implemented by subclasses
            evaluation_results = self.train_mlip(
                base_name, self.jobs_dict["mlip_committee"], **kwargs
            )

            if self.verbose > 0:
                print(f"AL Loop {loop} evaluation results: \n{evaluation_results}")

            if self.plots:
                mae_al_loop_plot(evaluation_results, self.jobs_dict["mlip_committee"])

            # Unlike the other steps, structure generation receives the whole
            # jobs dictionary rather than a single sub-dictionary.
            generated_structures = self.generate_structures(
                base_name, self.jobs_dict, train_xyzs, **kwargs
            )

            new_training_data = self.high_accuracy_evaluation(
                base_name,
                self.jobs_dict["high_accuracy_evaluation"],
                generated_structures,
                **kwargs,
            )

            train_xyzs += clean_structures(
                new_training_data, base_name, self.jobs_dict["high_accuracy_evaluation"]
            )

            if self.verbose > 0:
                print(
                    f"Completed AL loop {loop}, retraining with {len(train_xyzs)} structures."
                )

    def _load_initial_train_test_sets(
        self, dummy_run: bool = False
    ) -> tuple[list[Atoms], list[Atoms]]:
        """Read the initial train/test files and return them as lists of Atoms.

        Entries that are not `Atoms` instances are dropped. With
        ``dummy_run=True`` the sets are truncated to at most 500 training and
        200 test structures.
        """
        train_xyzs = [
            atoms
            for atoms in read(self.initial_train_file, ":")
            if isinstance(atoms, Atoms)
        ]
        test_xyzs = [
            atoms
            for atoms in read(self.initial_test_file, ":")
            if isinstance(atoms, Atoms)
        ]

        # Raised explicitly (instead of via `assert`) so the checks survive
        # `python -O`; AssertionError is kept for backward compatibility.
        if len(train_xyzs) <= 1:
            raise AssertionError("More than one training structure required.")
        if len(test_xyzs) <= 1:
            raise AssertionError("More than one test structure required.")

        if dummy_run:
            train_xyzs = train_xyzs[:500]
            test_xyzs = test_xyzs[:200]

        return train_xyzs, test_xyzs

    def _write_current_sets(
        self, workdir: Path, train_xyzs: list[Atoms], test_xyzs: list[Atoms]
    ) -> None:
        """Write the current train/test sets as extxyz files into `workdir`."""
        # Ensure directory exists before writing files
        try:
            workdir.mkdir(exist_ok=True, parents=True)
        except OSError as e:
            print(f"Warning: Could not create directory {workdir}: {e}")
            # Continue anyway - a genuinely missing directory makes the
            # writes below fail, and that failure is reported there.

        train_file = Path(workdir, "train_set.xyz")
        test_file = Path(workdir, "test_set.xyz")

        try:
            write(train_file, train_xyzs, format="extxyz")
            write(test_file, test_xyzs, format="extxyz")
        except OSError as e:
            if not self._is_tolerated_write_error(e):
                raise
            print(f"Warning: Could not write files (test environment): {e}")

    @staticmethod
    def _is_tolerated_write_error(error: OSError) -> bool:
        """Return True when a write failure may be downgraded to a warning.

        The message check alone is not sufficient: OSError messages usually
        contain the file name, so every failure involving ``test_set.xyz``
        would match "test". The error is therefore only tolerated while
        pytest is actually running (it sets ``PYTEST_CURRENT_TEST``).
        """
        running_under_pytest = "PYTEST_CURRENT_TEST" in os.environ
        return running_under_pytest and "test" in str(error).lower()

    def process_structure(self, structure: Atoms) -> Atoms:
        """
        Process a structure before adding it to the training set.

        A copy of the structure is made; the potential energy of the original
        is stored in ``info["REF_energy"]`` and its forces in
        ``arrays["REF_forces"]`` of the copy. The input is not modified by
        this method itself.

        Parameters
        ----------
        structure : Atoms
            The structure to process. Its energy and forces are obtained via
            ``get_potential_energy()`` and ``get_forces()``.

        Returns
        -------
        Atoms
            The processed copy carrying the reference energy and forces.
        """
        new_structure = structure.copy()
        new_structure.info["REF_energy"] = structure.get_potential_energy()
        new_structure.arrays["REF_forces"] = structure.get_forces()

        return new_structure

    @abstractmethod
    def high_accuracy_evaluation(
        self,
        base_name: str,
        high_accuracy_eval_job_dict: dict,
        structures: list[Atoms],
        **kwargs,
    ) -> list[Atoms]:
        """
        Run high-accuracy calculations on selected structures.

        Parameters
        ----------
        base_name : str
            Base name for this AL loop
        high_accuracy_eval_job_dict : dict
            Dictionary containing job name and HPC parameters for high-accuracy evaluation
        structures : List[Atoms]
            Structures to evaluate with high-accuracy method
        **kwargs
            Additional keyword arguments

        Returns
        -------
        List[Atoms]
            Structures with high-accuracy results (energy, forces, etc.).
            `run` passes them to `clean_structures` before adding them to the
            training set.
        """

    @abstractmethod
    def train_mlip(
        self, base_name: str, mlip_committee_job_dict: dict, **kwargs
    ) -> pd.DataFrame:
        """
        Train machine learning interatomic potential.

        Parameters
        ----------
        base_name : str
            Base name for this AL loop
        mlip_committee_job_dict : dict
            Dictionary containing job name and HPC parameters for MLIP training
        **kwargs
            Additional keyword arguments

        Returns
        -------
        pd.DataFrame
            Evaluation results of the training. `run` prints them and passes
            them to `mae_al_loop_plot`; the expected columns are determined by
            that function, not by this class.
        """

    @abstractmethod
    def generate_structures(
        self,
        base_name: str,
        structure_generation_job_dict: dict,
        train_data: list[Atoms],
        **kwargs,
    ) -> list[Atoms]:
        """
        Generate structures for active learning selection.

        Parameters
        ----------
        base_name : str
            Base name for this AL loop
        structure_generation_job_dict : dict
            Dictionary containing job name and HPC parameters for structure
            generation. Note that `run` passes the complete ``jobs_dict``
            here, not a single sub-dictionary.
        train_data : List[Atoms]
            Current training data
        **kwargs
            Additional keyword arguments

        Returns
        -------
        List[Atoms]
            Generated structures for high-accuracy evaluation
        """