Source code for linchemin.rem.graph_distance

import abc
import multiprocessing as mp
from dataclasses import dataclass
from functools import partial
from typing import List, Union

import networkx as nx
import numpy as np
import pandas as pd

import linchemin.cheminfo.functions as cif
from linchemin import settings
from linchemin.cgu.syngraph import BipartiteSynGraph, MonopartiteReacSynGraph
from linchemin.cgu.translate import translator
from linchemin.cheminfo.chemical_similarity import (
    compute_mol_fingerprint,
    compute_reaction_fingerprint,
    compute_similarity,
)
from linchemin.cheminfo.models import ChemicalEquation, Molecule
from linchemin.configuration.defaults import DEFAULT_GED
from linchemin.utilities import console_logger

"""
Module containing classes and functions
to compute the similarity between pairs of routes.
"""

# Module-level logger, used by the classes and functions below to report errors
logger = console_logger(__name__)


class GraphDistanceError(Exception):
    """Root of the exception hierarchy for failed graph distance calculations.

    The more specific errors defined in this module derive from it.
    """


class UnavailableGED(GraphDistanceError):
    """Signals that the requested graph distance method
    is not one of the available ones."""


class MismatchingGraph(GraphDistanceError):
    """Signals that the two input graphs do not have the same type."""


class TooFewRoutes(GraphDistanceError):
    """Signals that a distance matrix was requested for fewer than 2 routes."""


@dataclass
class ChemicalSimilarityParameters:
    """Container for the parameters used in chemical similarity calculations.

    Every field takes its default from the corresponding ``settings.GED`` value.
    """

    # Type of fingerprint to be used for reactions
    reaction_fingerprint: str = settings.GED.reaction_fp
    # Optional parameters for computing the reaction fingerprints
    reaction_fp_params: Union[dict, None] = settings.GED.reaction_fp_params
    # Similarity type to be used when comparing reaction fingerprints
    reaction_similarity: str = settings.GED.reaction_similarity_name
    # Type of fingerprint to be used for molecules
    molecular_fingerprint: str = settings.GED.molecular_fp
    # Optional parameters for computing the molecular fingerprints
    molecular_fp_params: Union[dict, None] = settings.GED.molecular_fp_params
    # Whether the count-vector variant of the molecular fingerprint is requested
    molecular_fp_count_vect: bool = settings.GED.molecular_fp_count_vect
    # Similarity type to be used when comparing molecular fingerprints
    molecular_similarity_name: str = settings.GED.molecular_similarity_name



[docs]
class Ged(metaclass=abc.ABCMeta):
    """Abstract class for Ged calculators.

    Concrete calculators must implement :meth:`compute_ged`.
    """

    @abc.abstractmethod
    def compute_ged(
        self,
        syngraph1: Union[MonopartiteReacSynGraph, BipartiteSynGraph],
        syngraph2: Union[MonopartiteReacSynGraph, BipartiteSynGraph],
        ged_params: ChemicalSimilarityParameters,
    ) -> float:
        """
        To calculate the Graph Edit Distance for a pair of graphs.

        Parameters:
        ------------
        syngraph1: Union[MonopartiteReacSynGraph, BipartiteSynGraph]
            The first graph
        syngraph2: Union[MonopartiteReacSynGraph, BipartiteSynGraph]
            The second graph
        ged_params: ChemicalSimilarityParameters
            It contains the parameters to be used in the chemical similarity calculation

        Returns:
        ---------
        ged: float
            The value of the GED
        """




class GedOptNx(Ged):
    """Subclass for the calculation of the
    optimized GED algorithm as implemented in NetworkX."""

    def compute_ged(
        self,
        syngraph1: Union[MonopartiteReacSynGraph, BipartiteSynGraph],
        syngraph2: Union[MonopartiteReacSynGraph, BipartiteSynGraph],
        ged_params: ChemicalSimilarityParameters,
    ) -> float:
        """
        To compute the GED between two SynGraph instances with the optimized
        GED algorithm of NetworkX.

        Parameters:
        ------------
        syngraph1: Union[MonopartiteReacSynGraph, BipartiteSynGraph]
            The first graph
        syngraph2: Union[MonopartiteReacSynGraph, BipartiteSynGraph]
            The second graph; it must have the same type as the first one
        ged_params: ChemicalSimilarityParameters
            The fingerprints and similarity settings for reactions and molecules

        Returns:
        ---------
        ged: float
            The last value produced by nx.optimize_graph_edit_distance

        Raises:
        -------
        MismatchingGraph: if the two graphs are not both MonopartiteReacSynGraph
            or both BipartiteSynGraph
        """
        if isinstance(syngraph1, MonopartiteReacSynGraph) and isinstance(
            syngraph2, MonopartiteReacSynGraph
        ):
            out_data_model = "monopartite_reactions"

        elif isinstance(syngraph1, BipartiteSynGraph) and isinstance(
            syngraph2, BipartiteSynGraph
        ):
            out_data_model = "bipartite"

        else:
            # The trailing space after the first type keeps the two reports apart
            logger.error(
                f"Graph1 has type = {type(syngraph1)} "
                f"Graph2 has type = {type(syngraph2)}. "
                f"The GED cannot be computed between graph of different types."
            )
            raise MismatchingGraph
        nx_graphs = [
            translator("syngraph", s, "networkx", out_data_model=out_data_model)
            for s in [syngraph1, syngraph2]
        ]
        # The cost function uses the selected reaction
        # and molecular fingerprints and the selected similarity type.
        node_subst_cost_partial = partial(
            node_subst_cost,
            reaction_fingerprints=ged_params.reaction_fingerprint,
            reaction_fp_params=ged_params.reaction_fp_params,
            reaction_similarity_name=ged_params.reaction_similarity,
            molecular_fingerprint=ged_params.molecular_fingerprint,
            molecular_fp_params=ged_params.molecular_fp_params,
            molecular_fp_count_vect=ged_params.molecular_fp_count_vect,
            molecular_similarity_name=ged_params.molecular_similarity_name,
        )

        opt_ged = nx.optimize_graph_edit_distance(
            nx_graphs[0], nx_graphs[1], node_subst_cost=node_subst_cost_partial
        )

        # The generator is exhausted and only its last value is kept
        for g in opt_ged:
            min_g = g
        return min_g


class GedNxPrecomputedMatrix(Ged):
    """Subclass for the calculation of the GED algorithm as implemented in NetworkX;
    the chemical similarity between nodes is precomputed."""

    def compute_ged(
        self,
        syngraph1: Union[MonopartiteReacSynGraph, BipartiteSynGraph],
        syngraph2: Union[MonopartiteReacSynGraph, BipartiteSynGraph],
        ged_params: ChemicalSimilarityParameters,
    ) -> float:
        """
        To compute the GED between two SynGraph instances with the NetworkX
        GED algorithm, using similarity matrices computed in advance.

        Parameters:
        ------------
        syngraph1: Union[MonopartiteReacSynGraph, BipartiteSynGraph]
            The first graph
        syngraph2: Union[MonopartiteReacSynGraph, BipartiteSynGraph]
            The second graph; it must have the same type as the first one
        ged_params: ChemicalSimilarityParameters
            The fingerprints and similarity settings for reactions and molecules

        Returns:
        ---------
        ged: float
            The value returned by nx.graph_edit_distance

        Raises:
        -------
        MismatchingGraph: if the two graphs are not both MonopartiteReacSynGraph
            or both BipartiteSynGraph
        """
        if isinstance(syngraph1, MonopartiteReacSynGraph) and isinstance(
            syngraph2, MonopartiteReacSynGraph
        ):
            out_data_model = "monopartite_reactions"
            reaction_similarity_matrix = self.precompute_reaction_similarity_matrix(
                syngraph1, syngraph2, ged_params
            )
            # No molecule matrix is built for monopartite reaction graphs
            molecule_similarity_matrix = None

        elif isinstance(syngraph1, BipartiteSynGraph) and isinstance(
            syngraph2, BipartiteSynGraph
        ):
            out_data_model = "bipartite"
            reaction_similarity_matrix = self.precompute_reaction_similarity_matrix(
                syngraph1, syngraph2, ged_params
            )
            molecule_similarity_matrix = self.precompute_molecule_similarity_matrix(
                syngraph1, syngraph2, ged_params
            )

        else:
            # The trailing space after the first type keeps the two reports apart
            logger.error(
                f"Graph1 has type = {type(syngraph1)} "
                f"Graph2 has type = {type(syngraph2)}. "
                f"The GED cannot be computed between graph of different types."
            )
            raise MismatchingGraph

        # The cost function only looks values up in the precomputed matrices
        node_subst_cost_partial = partial(
            node_subst_cost_matrix,
            reaction_similarity_matrix=reaction_similarity_matrix,
            molecule_similarity_matrix=molecule_similarity_matrix,
        )

        nx_graphs = [
            translator("syngraph", s, "networkx", out_data_model=out_data_model)
            for s in [syngraph1, syngraph2]
        ]
        # Retrieve the roots of the routes (first node without outgoing edges)
        root_g1 = next(n for n, d in nx_graphs[0].out_degree() if d == 0)
        root_g2 = next(n for n, d in nx_graphs[1].out_degree() if d == 0)

        return nx.graph_edit_distance(
            nx_graphs[0],
            nx_graphs[1],
            node_subst_cost=node_subst_cost_partial,
            roots=(root_g1, root_g2),
        )

    @staticmethod
    def precompute_reaction_similarity_matrix(
        syngraph1: Union[MonopartiteReacSynGraph, BipartiteSynGraph],
        syngraph2: Union[MonopartiteReacSynGraph, BipartiteSynGraph],
        ged_params: ChemicalSimilarityParameters,
    ) -> pd.DataFrame:
        """To precompute the similarity matrix for ChemicalEquation nodes only,
        using the reaction fingerprint and reaction similarity settings."""
        d_reactions1 = get_reactions_fp_dict(
            syngraph1, ged_params.reaction_fingerprint, ged_params.reaction_fp_params
        )

        d_reactions2 = get_reactions_fp_dict(
            syngraph2, ged_params.reaction_fingerprint, ged_params.reaction_fp_params
        )

        return build_similarity_matrix(
            d_reactions1, d_reactions2, ged_params.reaction_similarity
        )

    @staticmethod
    def precompute_molecule_similarity_matrix(
        syngraph1: Union[MonopartiteReacSynGraph, BipartiteSynGraph],
        syngraph2: Union[MonopartiteReacSynGraph, BipartiteSynGraph],
        ged_params: ChemicalSimilarityParameters,
    ) -> pd.DataFrame:
        """To precompute the similarity matrix for Molecule nodes only,
        using the molecular fingerprint and molecular similarity settings."""
        d_mol1 = get_mol_fp_dict(
            syngraph1,
            ged_params.molecular_fingerprint,
            ged_params.molecular_fp_params,
            ged_params.molecular_fp_count_vect,
        )
        d_mol2 = get_mol_fp_dict(
            syngraph2,
            ged_params.molecular_fingerprint,
            ged_params.molecular_fp_params,
            ged_params.molecular_fp_count_vect,
        )

        # Molecules must be compared with the molecular similarity,
        # not with the one selected for reactions
        return build_similarity_matrix(
            d_mol1, d_mol2, ged_params.molecular_similarity_name
        )


class GedNx(Ged):
    """Calculator based on the standard GED algorithm of NetworkX."""

    def compute_ged(
        self,
        syngraph1: Union[MonopartiteReacSynGraph, BipartiteSynGraph],
        syngraph2: Union[MonopartiteReacSynGraph, BipartiteSynGraph],
        ged_params: ChemicalSimilarityParameters,
    ) -> float:
        """
        To compute the GED between two SynGraph instances with the NetworkX
        GED algorithm; node similarities are computed on the fly.

        Parameters:
        ------------
        syngraph1: Union[MonopartiteReacSynGraph, BipartiteSynGraph]
            The first graph
        syngraph2: Union[MonopartiteReacSynGraph, BipartiteSynGraph]
            The second graph; it must have the same type as the first one
        ged_params: ChemicalSimilarityParameters
            The fingerprints and similarity settings for reactions and molecules

        Returns:
        ---------
        ged: float
            The value returned by nx.graph_edit_distance

        Raises:
        -------
        MismatchingGraph: if the two graphs are not both MonopartiteReacSynGraph
            or both BipartiteSynGraph
        """
        graph_pair = (syngraph1, syngraph2)
        if all(isinstance(g, MonopartiteReacSynGraph) for g in graph_pair):
            data_model = "monopartite_reactions"
        elif all(isinstance(g, BipartiteSynGraph) for g in graph_pair):
            data_model = "bipartite"
        else:
            logger.error(
                f"Graph1 has type = {type(syngraph1)} "
                f"Graph2 has type = {type(syngraph2)}. "
                f"The GED cannot be computed between graph of different types."
            )
            raise MismatchingGraph

        first_nx = translator(
            "syngraph", syngraph1, "networkx", out_data_model=data_model
        )
        second_nx = translator(
            "syngraph", syngraph2, "networkx", out_data_model=data_model
        )

        # Settings forwarded to the substitution cost function on every call
        cost_settings = {
            "reaction_fingerprints": ged_params.reaction_fingerprint,
            "reaction_fp_params": ged_params.reaction_fp_params,
            "reaction_similarity_name": ged_params.reaction_similarity,
            "molecular_fingerprint": ged_params.molecular_fingerprint,
            "molecular_fp_params": ged_params.molecular_fp_params,
            "molecular_fp_count_vect": ged_params.molecular_fp_count_vect,
            "molecular_similarity_name": ged_params.molecular_similarity_name,
        }
        substitution_cost = partial(node_subst_cost, **cost_settings)

        # The root of each route is the first node without outgoing edges
        first_root = next(
            node for node, degree in first_nx.out_degree if degree == 0
        )
        second_root = next(
            node for node, degree in second_nx.out_degree if degree == 0
        )

        return nx.graph_edit_distance(
            first_nx,
            second_nx,
            node_subst_cost=substitution_cost,
            roots=(first_root, second_root),
        )



[docs]
class GedFactory:
    """Factory giving access to the GED calculators.

    Attributes:
    -----------
    available_ged: a dictionary
        It maps the name of each GED algorithm to a dictionary holding the
        matching Ged subclass ('value') and a short description ('info')
    """

    available_ged = {
        "nx_ged": {
            "value": GedNx,
            "info": 'Standard NetworkX GED algorithm. The "root" argument is used',
        },
        "nx_ged_matrix": {
            "value": GedNxPrecomputedMatrix,
            "info": "Standard NetworkX GED algorithm. "
            "The distance matrix is computed in advance"
            'and the "root" algorithm is used',
        },
        "nx_optimized_ged": {
            "value": GedOptNx,
            "info": "Optimized NetworkX GED algorithm",
        },
    }

    def select_ged(
        self,
        syngraph1,
        syngraph2,
        ged_method,
        ged_params: ChemicalSimilarityParameters,
    ):
        """
        To run the GED calculator registered under the given name.

        Parameters:
        ------------
        syngraph1, syngraph2:
            The two graphs, forwarded unchanged to the calculator
        ged_method:
            The key of the desired algorithm in 'available_ged'
        ged_params: ChemicalSimilarityParameters
            The chemical similarity settings, forwarded to the calculator

        Returns:
        ---------
        The value returned by the 'compute_ged' method of the selected calculator

        Raises:
        -------
        UnavailableGED: if 'ged_method' is not a key of 'available_ged'
        """
        if ged_method in self.available_ged:
            calculator_class = self.available_ged[ged_method]["value"]
            calculator = calculator_class()
            return calculator.compute_ged(syngraph1, syngraph2, ged_params)

        logger.error(
            f"'{ged_method}' is invalid. "
            f"Available algorithms are: {self.available_ged.keys()}"
        )
        raise UnavailableGED




[docs]
def graph_distance_factory(
    syngraph1: Union[MonopartiteReacSynGraph, BipartiteSynGraph],
    syngraph2: Union[MonopartiteReacSynGraph, BipartiteSynGraph],
    ged_method: str,
    ged_params: Union[dict, None] = None,
) -> float:
    """
    To compute the graph edit distance between 2 SynGraph objects

    Parameters:
    -----------
    syngraph1: Union[MonopartiteReacSynGraph, BipartiteSynGraph]
        One of the input graphs
    syngraph2: Union[MonopartiteReacSynGraph, BipartiteSynGraph]
        Another input graph
    ged_method: str
        The name of the graph edit distance algorithm to be used
    ged_params: Union[dict, None]
        The optional parameters for the chemical similarity calculations.
        Recognised keys:
        (i) reaction_fp: the type of fingerprints to be used for reactions
        (ii) reaction_fp_params: the optional parameters for computing reaction fingerprints
        (iii) reaction_similarity_name: the similarity type to be used for reactions
        (iv) molecular_fp: the type of fingerprints to be used for molecules
        (v) molecular_fp_params: the optional parameters for computing molecular fingerprints
        (vi) molecular_fp_count_vect: whether 'GetCountFingerprint' should be used
        (vii) molecular_similarity_name: the similarity type to be used for molecules
        When None, the default parameters are used (default None)

    Returns:
    ------
    ged: float
        The ged between the two input graphs

    Example:
    -------
    >>> graphs = json.loads(open('ibm_file.json').read())
    >>> syngraphs = [translator('ibm_retro', g, 'syngraph', out_data_model='bipartite') for g in graphs]
    >>> ged = graph_distance_factory(syngraphs[0], syngraphs[3], ged_method='nx_ged')
    """  # noqa: E501
    # Turn the optional dictionary into a fully populated parameters object
    similarity_settings = set_chemical_similarity_parameters(ged_params)
    return GedFactory().select_ged(
        syngraph1, syngraph2, ged_method, similarity_settings
    )



def set_chemical_similarity_parameters(
    ged_params: Union[dict, None],
) -> ChemicalSimilarityParameters:
    """
    To build the ChemicalSimilarityParameters instance with the desired parameters.

    Parameters:
    ------------
    ged_params: Union[dict, None]
        The user-selected parameters. Recognised keys are 'reaction_fp',
        'reaction_fp_params', 'reaction_similarity_name', 'molecular_fp',
        'molecular_fp_params', 'molecular_fp_count_vect' and
        'molecular_similarity_name'; a missing key falls back to the
        corresponding value in settings.GED. If None, all defaults are used.

    Returns:
    ---------
    params: ChemicalSimilarityParameters
        The populated parameters object
    """
    if ged_params is None:
        return ChemicalSimilarityParameters()

    ged_settings = settings.GED
    return ChemicalSimilarityParameters(
        reaction_fingerprint=ged_params.get("reaction_fp", ged_settings.reaction_fp),
        reaction_fp_params=ged_params.get(
            "reaction_fp_params", ged_settings.reaction_fp_params
        ),
        # The dataclass field is 'reaction_similarity' (without '_name'):
        # this is the attribute the GED calculators read.
        reaction_similarity=ged_params.get(
            "reaction_similarity_name", ged_settings.reaction_similarity_name
        ),
        molecular_fingerprint=ged_params.get(
            "molecular_fp", ged_settings.molecular_fp
        ),
        molecular_fp_params=ged_params.get(
            "molecular_fp_params", ged_settings.molecular_fp_params
        ),
        molecular_fp_count_vect=ged_params.get(
            "molecular_fp_count_vect", ged_settings.molecular_fp_count_vect
        ),
        molecular_similarity_name=ged_params.get(
            "molecular_similarity_name", ged_settings.molecular_similarity_name
        ),
    )


# COST FUNCTIONS
def node_subst_cost_matrix(
    node1,
    node2,
    reaction_similarity_matrix: pd.DataFrame,
    molecule_similarity_matrix: Union[pd.DataFrame, None],
):
    """To compute the cost of substituting one node with another, based on the pre-computed similarity matrices.
    The more different the nodes, the higher the cost.

    The similarity is read at row = uid of the node_type of node2 and
    column = uid of the node_type of node1; the cost is 1 - similarity.
    Nodes of different types get the maximum cost of 1.0.
    """  # noqa: E501
    first_kind = node1["properties"]["node_type"]
    # Each node class is paired with the matrix holding its similarities
    lookup_tables = (
        (ChemicalEquation, reaction_similarity_matrix),
        (Molecule, molecule_similarity_matrix),
    )
    for node_class, similarity_table in lookup_tables:
        if not isinstance(first_kind, node_class):
            continue
        second_kind = node2["properties"]["node_type"]
        if isinstance(second_kind, node_class):
            return 1.0 - similarity_table.loc[second_kind.uid, first_kind.uid]

    return 1.0


def node_subst_cost(
    node1,
    node2,
    reaction_fingerprints,
    reaction_fp_params,
    reaction_similarity_name,
    molecular_fingerprint,
    molecular_fp_params,
    molecular_fp_count_vect,
    molecular_similarity_name,
) -> float:
    """To compute the cost of substituting one node with another, based on the selected fingerprints/similarity.
    The more different the nodes, the higher the cost.

    Two ChemicalEquation nodes are compared through their 'rdrxn' attributes,
    two Molecule nodes through their 'rdmol' attributes; any other
    combination gets the maximum cost.

    Returns:
    ---------
    cost: float
        The cost of the substitution (between 0 and 1)
    """  # noqa: E501
    first_kind = node1["properties"]["node_type"]

    # Reaction vs reaction
    if isinstance(first_kind, ChemicalEquation) and isinstance(
        node2["properties"]["node_type"], ChemicalEquation
    ):
        second_kind = node2["properties"]["node_type"]
        return get_reaction_similarity(
            first_kind.rdrxn,
            second_kind.rdrxn,
            reaction_fingerprints,
            reaction_fp_params,
            reaction_similarity_name,
        )

    # Molecule vs molecule
    if isinstance(first_kind, Molecule) and isinstance(
        node2["properties"]["node_type"], Molecule
    ):
        second_kind = node2["properties"]["node_type"]
        return get_molecular_similarity(
            first_kind.rdmol,
            second_kind.rdmol,
            molecular_fingerprint,
            molecular_fp_params,
            molecular_similarity_name,
            molecular_fp_count_vect,
        )

    # Nodes of different types: the maximum diversity is returned
    return 1.0


def get_reaction_similarity(
    rdrxn1: cif.rdChemReactions,
    rdrxn2: cif.rdChemReactions,
    reaction_fingerprint,
    reaction_fp_params,
    reaction_similarity,
) -> float:
    """To compare two reactions through their fingerprints.

    Note that the returned value is 1 - similarity, i.e. it grows as the
    two reactions become more different.
    """
    fingerprints = [
        compute_reaction_fingerprint(
            reaction,
            fp_name=reaction_fingerprint,
            params=reaction_fp_params,
        )
        for reaction in (rdrxn1, rdrxn2)
    ]
    return 1.0 - compute_similarity(
        fingerprints[0], fingerprints[1], similarity_name=reaction_similarity
    )


def get_molecular_similarity(
    rdmol1: cif.rdChemReactions,
    rdmol2: cif.rdChemReactions,
    molecular_fingerprint,
    molecular_fp_params,
    molecular_similarity_name,
    molecular_fp_count_vect,
) -> float:
    """To compare two molecules through their fingerprints.

    Note that the returned value is 1 - similarity, i.e. it grows as the
    two molecules become more different.
    """
    # NOTE(review): the annotations mention rdChemReactions although the
    # arguments are the 'rdmol' attributes of Molecule nodes - confirm the type
    fingerprints = [
        compute_mol_fingerprint(
            molecule,
            fp_name=molecular_fingerprint,
            parameters=molecular_fp_params,
            count_fp_vector=molecular_fp_count_vect,
        )
        for molecule in (rdmol1, rdmol2)
    ]
    return 1.0 - compute_similarity(
        fingerprints[0], fingerprints[1], similarity_name=molecular_similarity_name
    )


def get_mol_fp_dict(
    syngraph: BipartiteSynGraph,
    molecular_fingerprint: str,
    molecular_fp_params=DEFAULT_GED["molecular_fp_params"]["value"],
    molecular_fp_count_vect=DEFAULT_GED["molecular_fp_count_vect"]["value"],
) -> dict:
    """
    To map the uid of each Molecule node of a SynGraph to its fingerprint.

    Parameters:
    ------------
    syngraph: BipartiteSynGraph
        The graph whose Molecule nodes should be fingerprinted
    molecular_fingerprint: str
        The selected type of molecular fingerprint
    molecular_fp_params: dict
        The optional parameters for computing molecular fingerprints
    molecular_fp_count_vect: bool
        Whether 'GetCountFingerprint' should be used

    Returns:
    ----------
    molecule_node_fingerprints: dict
        The fingerprints of the Molecule nodes, in the form {uid: fingerprint}
    """
    fingerprints_by_uid = {}
    for molecule in get_molecule_nodes(syngraph):
        fingerprints_by_uid[molecule.uid] = compute_mol_fingerprint(
            molecule.rdmol,
            molecular_fingerprint,
            molecular_fp_params,
            molecular_fp_count_vect,
        )
    return fingerprints_by_uid


def get_reactions_fp_dict(
    syngraph: Union[MonopartiteReacSynGraph, BipartiteSynGraph],
    reaction_fingerprints: str,
    reaction_fp_params=DEFAULT_GED["reaction_fp_params"]["value"],
) -> dict:
    """
    To map the uid of each ChemicalEquation node of a SynGraph to its fingerprint.

    Parameters:
    ------------
    syngraph: Union[MonopartiteReacSynGraph, BipartiteSynGraph]
        The graph whose ChemicalEquation nodes should be fingerprinted
    reaction_fingerprints: str
        The selected type of reaction fingerprint
    reaction_fp_params: dict
        The optional parameters for computing reaction fingerprints

    Returns:
    ----------
    reaction_node_fingerprints: dict
        The fingerprints of the ChemicalEquation nodes, in the form {uid: fingerprint}
    """  # noqa: E501
    fingerprints_by_uid = {}
    for equation in get_chemical_equation_nodes(syngraph):
        fingerprints_by_uid[equation.uid] = compute_reaction_fingerprint(
            equation.rdrxn, reaction_fingerprints, reaction_fp_params
        )
    return fingerprints_by_uid


def get_chemical_equation_nodes(
    syngraph: Union[MonopartiteReacSynGraph, BipartiteSynGraph]
) -> set:
    """To collect the ChemicalEquation nodes of a SynGraph.

    Every ChemicalEquation parent is collected, together with those of its
    children that are ChemicalEquation instances as well; the children of
    other kinds of parents are not inspected.
    """
    collected = set()
    for node, successors in syngraph.graph.items():
        if not isinstance(node, ChemicalEquation):
            continue
        collected.add(node)
        collected.update(
            successor
            for successor in successors
            if isinstance(successor, ChemicalEquation)
        )
    return collected


def get_molecule_nodes(syngraph: BipartiteSynGraph) -> set:
    """To collect the Molecule nodes of a SynGraph.

    Every Molecule parent is collected, together with those of its children
    that are Molecule instances as well; the children of other kinds of
    parents are not inspected.
    """
    collected = set()
    for node, successors in syngraph.graph.items():
        if not isinstance(node, Molecule):
            continue
        collected.add(node)
        collected.update(
            successor for successor in successors if isinstance(successor, Molecule)
        )
    return collected


def build_similarity_matrix(
    d_fingerprints1: dict,
    d_fingerprints2: dict,
    similarity_name: str = settings.GED.molecular_similarity_name,
) -> pd.DataFrame:
    """
    To build the similarity matrix between two routes with the selected method.

    Parameters:
    ------------
    d_fingerprints1: dict
        The fingerprint of the first graph to be considered in the form {hash: fingerprints}
    d_fingerprints2: dict
        The fingerprint of the second graph to be considered in the form {hash: fingerprints}
    similarity_name: str
        The similarity method to be used

    Returns:
    ---------
    matrix: pd.DataFrame
        a pandas dataframe (n nodes in graph2) x (n nodes in graph1) containing the similarity values:
        the rows are labelled with the keys of d_fingerprints2,
        the columns with the keys of d_fingerprints1
    """  # noqa: E501
    columns = list(d_fingerprints1.keys())
    rows = list(d_fingerprints2.keys())
    matrix = pd.DataFrame(
        np.zeros((len(rows), len(columns))), columns=columns, index=rows
    )

    # Dictionary keys are unique, so each cell is visited exactly once
    # and no "already computed" check is needed.
    for h1, fp1 in d_fingerprints1.items():
        for h2, fp2 in d_fingerprints2.items():
            matrix.loc[h2, h1] = compute_similarity(
                fp1, fp2, similarity_name=similarity_name
            )
    return matrix



[docs]
def compute_distance_matrix(
    syngraphs: List[Union[MonopartiteReacSynGraph, BipartiteSynGraph]],
    ged_method: str,
    ged_params: Union[dict, None] = None,
    parallelization: bool = False,
    n_cpu=settings.GED.n_cpu,
) -> pd.DataFrame:
    """
    To compute the distance matrix of a set of routes.

    Parameters:
    -----------
    syngraphs: List[Union[MonopartiteReacSynGraph, BipartiteSynGraph]]
        The routes for which the distance matrix must be computed
    ged_method: str
        The graph edit distance method to be used
    ged_params: Optional[Union[dict, None]]
        The dictionary containing the parameters for fingerprints and similarity calculations; if it is not provided,
        the default values are used (default None)
    parallelization: Optional[bool]
        Whether parallelization should be used (default False)
    n_cpu: Optional[int]
        If parallelization is activated, it indicates the number of CPUs to be used
        (default: settings.GED.n_cpu)

    Returns:
    --------
    matrix: a pandas DataFrame
        The distance matrix, with dimensions (n routes x n routes), with the graph distances

    Raises:
    -------
    TooFewRoutes: if fewer than 2 routes are provided

    Example:
    --------
    >>> graph = json.loads(open('az_file.json').read())
    >>> mp_syngraphs = [translator('az_retro', g, 'syngraph', out_data_model='monopartite_reactions') for g in graph]
    >>> m = compute_distance_matrix(mp_syngraphs, ged_method='nx_ged')
    """  # noqa: E501
    route_count = len(syngraphs)
    if route_count < 2:
        logger.error(
            "Less than 2 routes were found: "
            "it is not possible to compute the distance matrix"
        )
        raise TooFewRoutes

    if parallelization:
        return setup_parallel_calculation(syngraphs, n_cpu, ged_method, ged_params)

    # Sequential calculation: only the upper triangle (diagonal included)
    # is computed and each value is mirrored into the lower one.
    distances = pd.DataFrame(columns=range(route_count), index=range(route_count))
    for first in range(route_count):
        for second in range(first, route_count):
            distance = graph_distance_factory(
                syngraphs[first],
                syngraphs[second],
                ged_method=ged_method,
                ged_params=ged_params,
            )
            distances.loc[second, first] = distance
            distances.loc[first, second] = distance
    return distances



def setup_parallel_calculation(
    syngraphs: list, n_cpu: int, ged_method: str, ged_params: dict
) -> pd.DataFrame:
    """
    To setup the matrix calculation with parallelization.

    Parameters:
    -----------
    syngraphs: list
        The routes for which the distance matrix must be computed
    n_cpu: int
        The number of worker processes of the pool
    ged_method: str
        The graph edit distance method to be used
    ged_params: dict
        The optional parameters for the ged calculation

    Returns:
    --------
    matrix: a pandas DataFrame
        The distance matrix built from the results of the workers
    """
    routes_range = range(len(syngraphs))

    # One task per element of the upper triangle (diagonal included)
    tasks = [
        ((i, j, syngraphs[i], syngraphs[j]), ged_method, ged_params)
        for i in routes_range
        for j in routes_range
        if j >= i
    ]

    # The context manager releases the pool even if a worker raises.
    # All tasks are submitted at once, so rows do not wait for each other.
    with mp.Pool(n_cpu) as pool:
        results = pool.starmap(parallel_matrix_calculations, tasks)

    # build_ged_matrix expects a list of lists of (i, j, distance) tuples
    return build_ged_matrix([results], routes_range)


def parallel_matrix_calculations(
    data: tuple, ged_method: str, ged_params: dict
) -> tuple:
    """
    To compute a single element of the distance matrix, in a form suitable for parallel computation.

    Parameters:
    -----------
    data: tuple
        The two indices and the two routes of the matrix element, as
        (i, j, route1, route2)
    ged_method: str
        The graph edit distance method to be used
    ged_params: dict
        The optional parameters for the ged calculation

    Returns:
    --------
    i, j, sim: tuple
            The two indices of the matrix element and the distance between the two routes
    """  # noqa: E501
    row, column, first_route, second_route = data
    distance = graph_distance_factory(
        first_route, second_route, ged_method=ged_method, ged_params=ged_params
    )
    return row, column, distance


def build_ged_matrix(results: list, routes_range: range) -> pd.DataFrame:
    """To build the distance matrix from a list of lists.

    Each inner element is a tuple (i, j, value); the value is written at both
    [j, i] and [i, j], so the resulting matrix is symmetric.
    """
    distances = pd.DataFrame(columns=routes_range, index=routes_range)
    for chunk in results:
        for entry in chunk:
            first, second, value = entry[0], entry[1], entry[2]
            distances.loc[second, first] = value
            distances.loc[first, second] = value
    return distances


# Helper functions
def get_available_ged_algorithms() -> dict:
    """Returns a dictionary mapping the name of each available GED algorithm
    to its 'info' description"""
    registry = GedFactory.available_ged
    return {name: registry[name]["info"] for name in registry}


def get_ged_default_parameters() -> dict:
    """Returns a dictionary mapping each parameter of DEFAULT_GED
    to the content of its 'info' entry"""
    descriptions = {}
    for name in DEFAULT_GED:
        descriptions[name] = DEFAULT_GED[name]["info"]
    return descriptions


def get_ged_parameters() -> dict:
    """Returns a dictionary mapping each parameter of DEFAULT_GED
    to the content of its 'general_info' entry"""
    descriptions = {}
    for name in DEFAULT_GED:
        descriptions[name] = DEFAULT_GED[name]["general_info"]
    return descriptions