Skip to content

Commit

Permalink
DOS Fingerprints enhancements (materialsproject#3946)
Browse files Browse the repository at this point in the history
* crude phonon dos comparison

* add wasserstein metric, add dosfingerprint class

* update dos fingerprint test

* add exceptions, add PhononDosFingerprint class

* fix type hints and doc-strings

* fix type hints and doc-strings for phonon fingerprints

* adapt tests with recent changes to src code

* add simple phonon dos comparison test

* move methods and tests to PhononDos class

* fix dos fp exceptions test

* address review comments and adapt tests accordingly

* fix pending doc-strings

---------

Co-authored-by: J. George <JaGeo@users.noreply.github.com>
  • Loading branch information
naik-aakash and JaGeo authored Aug 16, 2024
1 parent 74befc8 commit 532281f
Show file tree
Hide file tree
Showing 4 changed files with 305 additions and 59 deletions.
106 changes: 65 additions & 41 deletions src/pymatgen/electronic_structure/dos.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from scipy.constants import value as _constant
from scipy.ndimage import gaussian_filter1d
from scipy.signal import hilbert
from scipy.stats import wasserstein_distance

from pymatgen.core import Structure, get_el_sp
from pymatgen.core.spectrum import Spectrum
Expand Down Expand Up @@ -647,12 +648,24 @@ def as_dict(self) -> dict[str, Any]:
}


class FingerPrint(NamedTuple):
"""The DOS fingerprint."""
class DosFingerprint(NamedTuple):
"""
Represents a Density of States (DOS) fingerprint.
This named tuple is used to store information related to the Density of States (DOS)
in a material. It includes the energies, densities, type, number of bins, and bin width.
energies: NDArray
densities: NDArray
type: str
Args:
energies: The energy values associated with the DOS.
densities: The corresponding density values for each energy.
fp_type: The type of DOS fingerprint.
n_bins: The number of bins used in the fingerprint.
bin_width: The width of each bin in the DOS fingerprint.
"""

energies: np.ndarray
densities: np.ndarray
fp_type: str
n_bins: int
bin_width: float

Expand Down Expand Up @@ -1191,22 +1204,22 @@ def get_upper_band_edge(

def get_dos_fp(
self,
type: str = "summed_pdos", # noqa: A002
fp_type: str = "summed_pdos",
binning: bool = True,
min_e: float | None = None,
max_e: float | None = None,
n_bins: int = 256,
normalize: bool = True,
) -> FingerPrint:
"""Generate the DOS FingerPrint.
) -> DosFingerprint:
"""Generate the DOS fingerprint.
Based on the work of:
F. Knoop, T. A. r Purcell, M. Scheffler, C. Carbogno, J. Open Source Softw. 2020, 5, 2671.
Source - https://gitlab.com/vibes-developers/vibes/-/tree/master/vibes/materials_fp
Copyright (c) 2020 Florian Knoop, Thomas A.R.Purcell, Matthias Scheffler, Christian Carbogno.
Args:
type (str): The FingerPrint type, can be "{s/p/d/f/summed}_{pdos/tdos}"
fp_type (str): The FingerPrint type, can be "{s/p/d/f/summed}_{pdos/tdos}"
(default is summed_pdos).
binning (bool): Whether to bin the DOS FingerPrint using np.linspace and n_bins.
Default is True.
Expand All @@ -1216,10 +1229,10 @@ def get_dos_fp(
normalize (bool): Whether to normalize the integrated DOS to 1. Default is True.
Raises:
ValueError: If "type" is not one of the accepted values.
ValueError: If "fp_type" is not one of the accepted values.
Returns:
FingerPrint(energies, densities, type, n_bins): The DOS fingerprint.
DosFingerprint(energies, densities, type, n_bins): The DOS fingerprint.
"""
energies = self.energies - self.efermi

Expand All @@ -1237,12 +1250,11 @@ def get_dos_fp(
pdos["tdos"] = self.get_densities()

try:
densities = pdos[type]
densities = pdos[fp_type]
assert densities is not None

if len(energies) < n_bins:
inds = np.where((energies >= min_e) & (energies <= max_e))
return FingerPrint(energies[inds], densities[inds], type, len(energies), np.diff(energies)[0])
return DosFingerprint(energies[inds], densities[inds], fp_type, len(energies), np.diff(energies)[0])

if binning:
ener_bounds = np.linspace(min_e, max_e, n_bins + 1)
Expand All @@ -1267,20 +1279,20 @@ def get_dos_fp(
else:
dos_rebin_sc = dos_rebin

return FingerPrint(np.array([ener]), dos_rebin_sc, type, n_bins, bin_width)
return DosFingerprint(np.array([ener]), dos_rebin_sc, fp_type, n_bins, bin_width)

except KeyError as exc:
raise ValueError(
"Please recheck type requested, either the orbital projections unavailable in input DOS or "
"Please recheck fp_type requested, either the orbital projections unavailable in input DOS or "
"there's a typo in type."
) from exc

@staticmethod
def fp_to_dict(fp: FingerPrint) -> dict[str, NDArray]:
def fp_to_dict(fp: DosFingerprint) -> dict[str, NDArray]:
"""Convert a DOS FingerPrint into a dict.
Args:
fp (FingerPrint): The DOS FingerPrint to convert.
fp (DosFingerprint): The DOS FingerPrint to convert.
Returns:
dict(Keys=type, Values=np.array(energies, densities)): The FingerPrint as dict.
Expand All @@ -1289,53 +1301,65 @@ def fp_to_dict(fp: FingerPrint) -> dict[str, NDArray]:

@staticmethod
def get_dos_fp_similarity(
fp1: FingerPrint,
fp2: FingerPrint,
fp1: DosFingerprint,
fp2: DosFingerprint,
col: int = 1,
pt: int | Literal["All"] = "All",
normalize: bool = False,
tanimoto: bool = False,
metric: Literal["tanimoto", "wasserstein", "cosine-sim"] = "tanimoto",
) -> float:
"""Calculate the similarity index (dot product) of two DOS FingerPrints.
Args:
fp1 (FingerPrint): The 1st DOS FingerPrint.
fp2 (FingerPrint): The 2nd DOS FingerPrint.
col (int): The item in the fingerprints (0: energies, 1: densities)
to take the dot product of (default is 1).
pt (int | "ALL") : The index of the point that the dot product is
to be taken (default is All).
normalize (bool): Whether to normalize the scalar product to 1 (default is False).
tanimoto (bool): Whether to compute Tanimoto index (default is False).
fp1 (DosFingerprint): The 1st dos fingerprint object
fp2 (DosFingerprint): The 2nd dos fingerprint object
col (int): The item in the fingerprints (0:energies,1: densities) to compute
the similarity index of (default is 1)
pt (int or str) : The index of the point that the dot product is to be taken (default is All)
normalize (bool): If True normalize the scalar product to 1 (default is False)
metric (Literal): Metric used to compute similarity default is "tanimoto".
Raises:
ValueError: If both tanimoto and normalize are True.
ValueError: If metric other than tanimoto, wasserstein and "cosine-sim" is requested.
ValueError: If normalize is set to True along with the metric.
Returns:
float: Similarity index given by the dot product.
"""
fp1_dict = fp1 if isinstance(fp1, dict) else CompleteDos.fp_to_dict(fp1)
fp2_dict = fp2 if isinstance(fp2, dict) else CompleteDos.fp_to_dict(fp2)
if metric not in ["tanimoto", "wasserstein", "cosine-sim"]:
raise ValueError(
"Requested metric not implemented. Currently implemented metrics are tanimoto, "
"wasserstien and cosine-sim."
)

fp1_dict = CompleteDos.fp_to_dict(fp1) if not isinstance(fp1, dict) else fp1
fp2_dict = CompleteDos.fp_to_dict(fp2) if not isinstance(fp2, dict) else fp2

if pt == "All":
vec1 = np.array([pt[col] for pt in fp1_dict.values()]).flatten()
vec2 = np.array([pt[col] for pt in fp2_dict.values()]).flatten()
else:
vec1 = fp1_dict[fp1[2][pt]][col]
vec2 = fp2_dict[fp2[2][pt]][col]

if not normalize:
rescale = np.linalg.norm(vec1) ** 2 + np.linalg.norm(vec2) ** 2 - np.dot(vec1, vec2) if tanimoto else 1.0
vec1 = fp1_dict[fp1[2][pt]][col] # type: ignore # noqa:PGH003
vec2 = fp2_dict[fp2[2][pt]][col] # type: ignore # noqa:PGH003

if not normalize and metric == "tanimoto":
rescale = np.linalg.norm(vec1) ** 2 + np.linalg.norm(vec2) ** 2 - np.dot(vec1, vec2)
return np.dot(vec1, vec2) / rescale

if not tanimoto:
if not normalize and metric == "wasserstein":
return wasserstein_distance(
u_values=np.cumsum(vec1 * fp1.bin_width), v_values=np.cumsum(vec2 * fp2.bin_width)
)

if normalize and metric == "cosine-sim":
rescale = np.linalg.norm(vec1) * np.linalg.norm(vec2)
return np.dot(vec1, vec2) / rescale

raise ValueError(
"Cannot compute similarity index. Please set either normalize=True or tanimoto=True or both to False."
)
if not normalize and metric == "cosine-sim":
rescale = 1.0
return np.dot(vec1, vec2) / rescale

raise ValueError("Cannot compute similarity index. When normalize=True, then please set metric=cosine-sim")

@classmethod
def from_dict(cls, dct: dict) -> Self:
Expand Down
161 changes: 160 additions & 1 deletion src/pymatgen/phonon/dos.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,22 @@

from __future__ import annotations

from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Literal, NamedTuple

import numpy as np
import scipy.constants as const
from monty.functools import lazy_property
from monty.json import MSONable
from scipy.ndimage import gaussian_filter1d
from scipy.stats import wasserstein_distance

from pymatgen.core.structure import Structure
from pymatgen.util.coord import get_linear_interpolated_value

if TYPE_CHECKING:
from collections.abc import Sequence

from numpy.typing import NDArray
from typing_extensions import Self

BOLTZ_THZ_PER_K = const.value("Boltzmann constant in Hz/K") / const.tera # Boltzmann constant in THz/K
Expand Down Expand Up @@ -412,6 +414,163 @@ def get_last_peak(self, threshold: float = 0.05) -> float:

return max(filtered_maxima_freqs)

def get_dos_fp(
self,
binning: bool = True,
min_f: float | None = None,
max_f: float | None = None,
n_bins: int = 256,
normalize: bool = True,
) -> PhononDosFingerprint:
"""Generate the DOS fingerprint.
Args:
binning (bool): If true, the DOS fingerprint is binned using np.linspace and n_bins.
Default is True.
min_f (float): The minimum mode frequency to include in the fingerprint (default is None)
max_f (float): The maximum mode frequency to include in the fingerprint (default is None)
n_bins (int): Number of bins to be used in the fingerprint (default is 256)
normalize (bool): If true, normalizes the area under fp to equal to 1. Default is True.
Returns:
PhononDosFingerprint: The phonon density of states fingerprint
of format (frequencies, densities, n_bins, bin_width)
"""
frequencies = self.frequencies

if max_f is None:
max_f = np.max(frequencies)

if min_f is None:
min_f = np.min(frequencies)

densities = self.densities

if len(frequencies) < n_bins:
inds = np.where((frequencies >= min_f) & (frequencies <= max_f))
return PhononDosFingerprint(frequencies[inds], densities[inds], len(frequencies), np.diff(frequencies)[0])

if binning:
freq_bounds = np.linspace(min_f, max_f, n_bins + 1)
freq = freq_bounds[:-1] + (freq_bounds[1] - freq_bounds[0]) / 2.0
bin_width = np.diff(freq)[0]
else:
freq_bounds = np.array(frequencies)
freq = np.append(frequencies, [frequencies[-1] + np.abs(frequencies[-1]) / 10])
n_bins = len(frequencies)
bin_width = np.diff(frequencies)[0]

dos_rebin = np.zeros(freq.shape)

for ii, e1, e2 in zip(range(len(freq)), freq_bounds[:-1], freq_bounds[1:]):
inds = np.where((frequencies >= e1) & (frequencies < e2))
dos_rebin[ii] = np.sum(densities[inds])
if normalize: # scale DOS bins to make area under histogram equal 1
area = np.sum(dos_rebin * bin_width)
dos_rebin_sc = dos_rebin / area
else:
dos_rebin_sc = dos_rebin

return PhononDosFingerprint(np.array([freq]), dos_rebin_sc, n_bins, bin_width)

@staticmethod
def fp_to_dict(fp: PhononDosFingerprint) -> dict:
"""Convert a fingerprint into a dictionary.
Args:
fp: The DOS fingerprint to be converted into a dictionary
Returns:
dict: A dict of the fingerprint Keys=type, Values=np.ndarray(frequencies, densities)
"""
fp_dict = {}
fp_dict[fp[2]] = np.array([fp[0], fp[1]], dtype="object").T

return fp_dict

@staticmethod
def get_dos_fp_similarity(
fp1: PhononDosFingerprint,
fp2: PhononDosFingerprint,
col: int = 1,
pt: int | str = "All",
normalize: bool = False,
metric: Literal["tanimoto", "wasserstein", "cosine-sim"] = "tanimoto",
) -> float:
"""Calculate the similarity index between two fingerprints.
Args:
fp1 (PhononDosFingerprint): The 1st dos fingerprint object
fp2 (PhononDosFingerprint): The 2nd dos fingerprint object
col (int): The item in the fingerprints (0:frequencies,1: densities) to compute
the similarity index of (default is 1)
pt (int or str) : The index of the point that the dot product is to be taken (default is All)
normalize (bool): If True normalize the scalar product to 1 (default is False)
metric (Literal): Metric used to compute similarity default is "tanimoto".
Raises:
ValueError: If metric other than tanimoto, wasserstein and "cosine-sim" is requested.
ValueError: If normalize is set to True along with the metric.
Returns:
float: Similarity index given by the dot product
"""
if metric not in ["tanimoto", "wasserstein", "cosine-sim"]:
raise ValueError(
"Requested metric not implemented. Currently implemented metrics are tanimoto, "
"wasserstien and cosine-sim."
)

fp1_dict = CompletePhononDos.fp_to_dict(fp1) if not isinstance(fp1, dict) else fp1

fp2_dict = CompletePhononDos.fp_to_dict(fp2) if not isinstance(fp2, dict) else fp2

if pt == "All":
vec1 = np.array([pt[col] for pt in fp1_dict.values()]).flatten()
vec2 = np.array([pt[col] for pt in fp2_dict.values()]).flatten()
else:
vec1 = fp1_dict[fp1[2][pt]][col] # type: ignore # noqa:PGH003
vec2 = fp2_dict[fp2[2][pt]][col] # type: ignore # noqa:PGH003

if not normalize and metric == "tanimoto":
rescale = np.linalg.norm(vec1) ** 2 + np.linalg.norm(vec2) ** 2 - np.dot(vec1, vec2)
return np.dot(vec1, vec2) / rescale

if not normalize and metric == "wasserstein":
return wasserstein_distance(
u_values=np.cumsum(vec1 * fp1.bin_width), v_values=np.cumsum(vec2 * fp2.bin_width)
)

if normalize and metric == "cosine-sim":
rescale = np.linalg.norm(vec1) * np.linalg.norm(vec2)
return np.dot(vec1, vec2) / rescale

if not normalize and metric == "cosine-sim":
rescale = 1.0
return np.dot(vec1, vec2) / rescale

raise ValueError("Cannot compute similarity index. When normalize=True, then please set metric=cosine-sim")


class PhononDosFingerprint(NamedTuple):
"""
Represents a Phonon Density of States (DOS) fingerprint.
This named tuple is used to store information related to the Density of States (DOS)
in a material. It includes the frequencies, densities, number of bins, and bin width.
Args:
frequencies: The frequency values associated with the DOS.
densities: The corresponding density values for each energy.
n_bins: The number of bins used in the fingerprint.
bin_width: The width of each bin in the DOS fingerprint.
"""

frequencies: NDArray
densities: NDArray
n_bins: int
bin_width: float


class CompletePhononDos(PhononDos):
"""This wrapper class defines a total dos, and also provides a list of PDos.
Expand Down
Loading

0 comments on commit 532281f

Please sign in to comment.