"""MaterialRegistry — global singleton for material lookup.
Owns the complete lookup table for both built-in and user-registered materials.
Wraps the existing ``catalog_nk.csv`` load and adds user catalog state.
The hot path (built-in CSV lookup) adds zero overhead compared to PR1: the
built-in DataFrame is lazy-loaded once per process and cached, exactly as before.
Kramer Harrison, 2025
"""
from __future__ import annotations
import contextlib
import pathlib
import tempfile
import warnings
from importlib import resources
import pandas as pd
import yaml
from optiland.materials.material_spec import MatchPolicy
from optiland.materials.warnings import OptilandMaterialWarning
# Packaged database resource, looked up once and shared by both paths below.
_DATABASE_ROOT = resources.files("optiland.database")
# Index CSV read by ``MaterialRegistry.built_in_df`` on first access.
_CATALOG_CSV = str(_DATABASE_ROOT.joinpath("catalog_nk.csv"))
# Base directory against which relative built-in ``filename`` values are resolved.
_DATA_NK_DIR = str(_DATABASE_ROOT.joinpath("data-nk"))
def _levenshtein(s1: str, s2: str) -> int:
    """Return the Levenshtein edit distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``s1`` into ``s2``.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        The edit distance as a non-negative integer.
    """
    # Only two rows of the dynamic-programming table are kept at any time.
    # Row 0: turning "" into s2[:j] costs j insertions.
    previous = list(range(len(s2) + 1))
    for i, left in enumerate(s1, start=1):
        # Column 0: turning s1[:i] into "" costs i deletions.
        current = [i]
        for j, right in enumerate(s2, start=1):
            substitution = previous[j - 1] + (0 if left == right else 1)
            deletion = previous[j] + 1
            insertion = current[j - 1] + 1
            current.append(min(deletion, insertion, substitution))
        previous = current
    return previous[-1]
def _catalog_dir_from_filename(filename: str, group: str = "") -> str:
    """Derive the catalog (manufacturer / species) name from a data-file path.

    Backslashes are treated as path separators. Paths with fewer than three
    segments yield an empty string.

    Args:
        filename: Path of the data file, e.g. ``glass/schott/N-BK7.yml``.
        group: Group of the entry; matched case-insensitively against
            ``"glass"``.

    Returns:
        For a ``glass`` group entry whose first segment is ``glass``, the
        second segment (whatever the nesting depth). For any other path with
        at least three segments, the second-to-last segment. Otherwise ``""``.
    """
    segments = filename.replace("\\", "/").split("/")
    # Evaluated before the length check so argument handling is unchanged.
    glass_entry = group.lower() == "glass" and segments[0].lower() == "glass"
    if len(segments) < 3:
        return ""
    return segments[1] if glass_entry else segments[-2]
def _extract_wavelength_range(data: dict) -> tuple[float, float]:
    """Extract min/max wavelength (µm) from a refractiveindex.info YAML payload.

    Each item of ``data["DATA"]`` is inspected in order and the first item
    that yields a range wins:

    * if the item has a non-empty ``data`` string, the first whitespace-
      separated field of every line is parsed as a wavelength and the
      minimum/maximum of the parsed values is returned;
    * otherwise (or if no line could be parsed) the item's
      ``wavelength_range`` field is used when it holds at least two
      whitespace-separated numbers. This is the path formula-only items take,
      since they carry no ``data`` table.

    Args:
        data: Parsed YAML payload. A missing or null ``DATA`` key is treated
            as an empty list.

    Returns:
        ``(min_wavelength, max_wavelength)``, or ``(0.0, 100.0)`` when no item
        provides a usable range.
    """
    for item in data.get("DATA") or []:
        raw = item.get("data", "")
        if raw:
            wavelengths: list[float] = []
            for line in raw.strip().splitlines():
                fields = line.split()
                if fields:
                    # Lines that do not start with a number are ignored.
                    with contextlib.suppress(ValueError):
                        wavelengths.append(float(fields[0]))
            if wavelengths:
                return min(wavelengths), max(wavelengths)

        # Not skipped for items without ``data``: formula entries may only
        # describe their validity range through ``wavelength_range``.
        wl_range = item.get("wavelength_range", "")
        if wl_range:
            bounds = str(wl_range).split()
            if len(bounds) >= 2:
                with contextlib.suppress(ValueError):
                    return float(bounds[0]), float(bounds[1])
    return 0.0, 100.0
[docs]
class MaterialRegistry:
"""Global singleton owning all material lookup state (built-in + user).
Access via :meth:`MaterialRegistry.instance`. All methods are safe to
call from any thread after the first :meth:`instance` call completes.
On the first call to :meth:`instance` the registry checks for
``~/.optiland/catalogs/``. Each subdirectory there is ingested via
:meth:`load_catalog` as a best-effort operation; failures emit
:class:`~optiland.materials.warnings.OptilandMaterialWarning` and are
skipped, never raising.
Args:
None — instantiate via :meth:`instance`.
"""
_instance: MaterialRegistry | None = None
def __init__(self) -> None:
    """Create a registry with no built-in table loaded and no user entries."""
    # Combined (built-in + user) table; ``None`` means "rebuild on next use".
    self._combined_cache: pd.DataFrame | None = None
    # One metadata dict per user-registered material, in registration order.
    self._user_entries: list[dict] = []
    # Paths of the temporary YAML files written by ``register``.
    self._user_temp_files: list[str] = []
    # Built-in catalog table; populated by the ``built_in_df`` property.
    self.__built_in_df: pd.DataFrame | None = None
[docs]
@classmethod
def instance(cls) -> MaterialRegistry:
    """Return the process-wide singleton, building it on the first call.

    On the first call a new registry is created, stored on the class and
    then asked to run ``_auto_discover``. Later calls return the stored
    object unchanged.

    Returns:
        The shared registry object.
    """
    existing = cls._instance
    if existing is not None:
        return existing

    registry = object.__new__(cls)
    registry.__init__()  # type: ignore[misc]
    # Stored before discovery runs, so an ``instance()`` call made while
    # discovery is in progress receives this same object.
    cls._instance = registry
    registry._auto_discover()
    return cls._instance
# ------------------------------------------------------------------
# Built-in catalog
# ------------------------------------------------------------------
@property
def built_in_df(self) -> pd.DataFrame:
    """Built-in catalog table, read from ``_CATALOG_CSV`` on first access.

    Returns:
        The cached DataFrame; the CSV is only read while no table is cached.
    """
    table = self.__built_in_df
    if table is None:
        table = pd.read_csv(_CATALOG_CSV)
        self.__built_in_df = table
    return table
# ------------------------------------------------------------------
# User catalog
# ------------------------------------------------------------------
[docs]
def register(self, name: str, catalog: str, data: dict) -> None:
    """Register a material programmatically.

    The ``data`` dict must follow the refractiveindex.info YAML schema. It is
    dumped to a temporary ``.yml`` file whose path is recorded in
    ``_user_temp_files`` and used as the entry's ``filename``.

    If a built-in or previously-registered entry with the same
    ``(name, catalog)`` key (compared case-insensitively) exists, a warning
    is emitted. Earlier user entries with that key are dropped so the new
    registration is the only user entry for the key.

    Args:
        name: Material name (used for lookup).
        catalog: Catalog name (e.g. ``"internal"``).
        data: refractiveindex.info YAML payload as a Python dict.
    """
    # Metadata is derived first so a malformed payload fails before any
    # temporary file has been created.
    min_wl, max_wl = _extract_wavelength_range(data)
    reference = data.get("REFERENCE", catalog)

    # A path separator in the prefix would point the temp file into a
    # sub-directory that does not exist.
    safe_prefix = name.replace("/", "_").replace("\\", "_")

    # delete=False keeps the file on disk after the handle is closed so the
    # stored path stays readable.
    tmp_path: str | None = None
    try:
        with tempfile.NamedTemporaryFile(
            delete=False,
            suffix=".yml",
            prefix=f"{safe_prefix}_",
            mode="w",
            encoding="utf-8",
        ) as tmp:
            tmp_path = tmp.name
            yaml.dump(data, tmp)
    except Exception:
        # Do not leave a half-written file behind when dumping fails.
        if tmp_path is not None:
            with contextlib.suppress(OSError):
                pathlib.Path(tmp_path).unlink()
        raise
    self._user_temp_files.append(tmp_path)

    entry = {
        "group": "user",
        "category_name": catalog,
        "category_name_full": catalog,
        "reference": reference,
        "name": name,
        "filename": tmp_path,
        "min_wavelength": min_wl,
        "max_wavelength": max_wl,
        "filename_no_ext": name,
        "catalog_dir": catalog.lower(),
    }

    # Warn while the older entry is still present, then replace it.
    self._warn_if_shadow(name, catalog)
    key = (name.lower(), catalog.lower())

    def _same_key(existing: dict) -> bool:
        return (
            str(existing.get("filename_no_ext", "")).lower(),
            str(existing.get("catalog_dir", "")).lower(),
        ) == key

    # Without this, the superseded entry would remain a second exact match.
    self._user_entries[:] = [e for e in self._user_entries if not _same_key(e)]
    self._user_entries.append(entry)
    self._combined_cache = None  # invalidate cache
[docs]
def register_file(self, path: str | pathlib.Path) -> None:
    """Load a single refractiveindex.info-format YAML file.

    The catalog name is the parent directory name and the material name is
    the filename stem. The entry's ``filename`` is the resolved absolute path
    of ``path``.

    If an entry with the same ``(name, catalog)`` key (compared
    case-insensitively) already exists, a warning is emitted. Earlier user
    entries with that key are dropped so this file is the only user entry
    for the key.

    Args:
        path: Path to a refractiveindex.info YAML file.
    """
    p = pathlib.Path(path)
    name = p.stem
    catalog = p.parent.name

    with p.open(encoding="utf-8") as fh:
        data = yaml.safe_load(fh)

    # An empty document falls back to the catalog name and the default range.
    reference = data.get("REFERENCE", catalog) if data else catalog
    min_wl, max_wl = _extract_wavelength_range(data or {})

    entry = {
        "group": "user",
        "category_name": catalog,
        "category_name_full": catalog,
        "reference": reference,
        "name": name,
        "filename": str(p.resolve()),
        "min_wavelength": min_wl,
        "max_wavelength": max_wl,
        "filename_no_ext": name,
        "catalog_dir": catalog.lower(),
    }

    # Warn while the older entry is still present, then replace it.
    self._warn_if_shadow(name, catalog)
    key = (name.lower(), catalog.lower())

    def _same_key(existing: dict) -> bool:
        return (
            str(existing.get("filename_no_ext", "")).lower(),
            str(existing.get("catalog_dir", "")).lower(),
        ) == key

    # Without this, the superseded entry would remain a second exact match.
    self._user_entries[:] = [e for e in self._user_entries if not _same_key(e)]
    self._user_entries.append(entry)
    self._combined_cache = None
[docs]
def load_catalog(self, directory: str | pathlib.Path) -> None:
    """Load all materials found in ``directory``.

    If a ``catalog.csv`` index exists (same schema as the built-in
    ``catalog_nk.csv``), its rows are added as user entries:

    * a missing ``catalog_dir`` column is filled with the lower-cased
      directory name;
    * relative ``filename`` values are resolved against ``directory``;
    * a missing ``filename_no_ext`` column is derived from the filename stem.

    Otherwise each ``*.yml`` file directly inside ``directory`` is registered
    via :meth:`register_file`; a file that fails to load emits an
    :class:`OptilandMaterialWarning` and is skipped.

    Nothing happens when ``directory`` is not an existing directory.

    Args:
        directory: Path to a directory of refractiveindex.info YAML files.
    """
    root = pathlib.Path(directory)
    if not root.is_dir():
        return

    index_csv = root / "catalog.csv"
    if not index_csv.exists():
        for yml_file in sorted(root.glob("*.yml")):
            try:
                self.register_file(yml_file)
            except Exception as exc:
                warnings.warn(
                    f"Failed to load material file '{yml_file}': {exc}",
                    OptilandMaterialWarning,
                    stacklevel=2,
                )
        return

    extra_df = pd.read_csv(index_csv)
    if "catalog_dir" not in extra_df.columns:
        extra_df["catalog_dir"] = root.name.lower()

    # Resolve relative filenames against the directory
    def _resolve_fn(fn: str) -> str:
        if pathlib.Path(fn).is_absolute():
            return fn
        return str((root / fn).resolve())

    extra_df["filename"] = extra_df["filename"].apply(_resolve_fn)

    # Lookup and shadowing are keyed on ``filename_no_ext``.
    if "filename_no_ext" not in extra_df.columns:
        extra_df["filename_no_ext"] = extra_df["filename"].map(
            lambda fn: pathlib.Path(fn).stem
        )

    records = extra_df.to_dict("records")
    for record in records:
        stem = record["filename_no_ext"]
        catalog = record["catalog_dir"]
        # Empty CSV cells are read as NaN and cannot form a shadow key.
        if pd.isna(stem) or pd.isna(catalog):
            continue
        # The key is (filename_no_ext, catalog_dir), not the ``name`` column.
        self._warn_if_shadow(str(stem), str(catalog))

    self._user_entries.extend(records)
    self._combined_cache = None
# ------------------------------------------------------------------
# Resolution
# ------------------------------------------------------------------
[docs]
def resolve(
    self,
    name: str,
    catalog: str | None = None,
    reference: str | None = None,
    match_policy: MatchPolicy = MatchPolicy.WARN,
    min_wavelength: float | None = None,
    max_wavelength: float | None = None,
) -> str:
    """Resolve a material to the path of its YAML data file.

    Delegates to ``_resolve_with_row`` and returns only the path part of its
    result.

    Args:
        name: Material name to search for.
        catalog: Manufacturer catalog to restrict lookup to.
        reference: Citation string for further disambiguation.
        match_policy: Controls fuzzy-match warnings/errors.
        min_wavelength: Minimum wavelength filter (µm).
        max_wavelength: Maximum wavelength filter (µm).

    Returns:
        Path to the resolved YAML data file.

    Raises:
        ValueError: If no match is found, or if ``match_policy`` is strict
            and the match is not exact / is ambiguous.
    """
    resolved = self._resolve_with_row(
        name,
        catalog,
        reference,
        match_policy,
        min_wavelength,
        max_wavelength,
    )
    path, _row = resolved
    return path
def _resolve_with_row(
    self,
    name: str,
    catalog: str | None,
    reference: str | None,
    match_policy: MatchPolicy,
    min_wavelength: float | None,
    max_wavelength: float | None,
) -> tuple[str, dict]:
    """Resolve a material and return ``(path, metadata_row_dict)``.

    Candidates come from the combined table, optionally narrowed to one
    catalog, and are scored by ``_find_matches``. The first row of the scored
    result is the one returned.

    A non-exact best match (score > 0), or several distinct files tied at
    score 0, is handled according to ``match_policy``:

    * ``STRICT`` raises ``ValueError``;
    * with a ``catalog`` given, any other policy warns on a non-exact match;
    * without a ``catalog``, only ``WARN`` warns on a non-exact match.

    Args:
        name: Material name to search for.
        catalog: Catalog to restrict the search to, or ``None``.
        reference: Extra text filter passed to ``_find_matches``.
        match_policy: Policy for non-exact or ambiguous matches.
        min_wavelength: Minimum wavelength filter (µm).
        max_wavelength: Maximum wavelength filter (µm).

    Returns:
        The data-file path and the selected row as a dict. A relative
        ``filename`` is joined onto ``_DATA_NK_DIR``; an absolute one is
        returned unchanged.

    Raises:
        ValueError: If the catalog has no rows, nothing matches, or the
            policy is strict and the match is not exact / is ambiguous.
    """
    candidates = self._get_combined_df()

    # If catalog given, pre-filter to that manufacturer
    if catalog is not None:
        wanted_catalog = catalog.lower()
        in_catalog = candidates["catalog_dir"].str.lower() == wanted_catalog
        candidates = candidates[in_catalog].copy()
        if candidates.empty:
            raise ValueError(f"No catalog '{catalog}' found in material database.")

    matches = self._find_matches(
        candidates, name, reference, min_wavelength, max_wavelength
    )
    if matches.empty:
        pieces = [f"No matches found for material '{name}'"]
        if catalog:
            pieces.append(f" in catalog '{catalog}'")
        if reference:
            pieces.append(f" with reference '{reference}'")
        raise ValueError("".join(pieces))

    scores = matches["similarity_score"]
    best_score = scores.iloc[0]
    # Number of distinct files among the rows that match exactly.
    n_exact_files = int(matches.loc[scores == 0, "filename"].nunique())
    is_fuzzy = best_score > 0
    ambiguous_exact = best_score == 0 and n_exact_files > 1

    if is_fuzzy or ambiguous_exact:
        if match_policy == MatchPolicy.STRICT:
            if catalog is not None:
                raise ValueError(
                    f"No exact match for '{name}' in catalog '{catalog}'. "
                    "Use the exact name or a less strict match_policy."
                )
            top = matches.head(5)["name"].tolist()
            raise ValueError(
                f"No exact match for material '{name}'. "
                f"Top candidates: {top}. "
                "Use match_policy='warn' or 'best' for fuzzy matching."
            )
        if is_fuzzy:
            resolved = matches.iloc[0]["name"]
            if catalog is not None:
                warnings.warn(
                    f"No exact match for '{name}' in catalog '{catalog}'; "
                    f"resolved to '{resolved}'. Use exact name to silence.",
                    OptilandMaterialWarning,
                    stacklevel=4,
                )
            elif match_policy == MatchPolicy.WARN:
                warnings.warn(
                    f"Material '{name}' resolved to '{resolved}' via fuzzy match.",
                    OptilandMaterialWarning,
                    stacklevel=4,
                )

    row = matches.iloc[0].to_dict()
    filename = row["filename"]
    # Built-in filenames are relative paths stored in CSV; resolve them.
    if pathlib.Path(filename).is_absolute():
        return filename, row
    return str(pathlib.Path(_DATA_NK_DIR) / filename), row
# ------------------------------------------------------------------
# Discovery
# ------------------------------------------------------------------
[docs]
def list_groups(self) -> list[str]:
    """List the distinct ``group`` values of the combined table.

    Returns:
        Sorted group names from built-in and user entries, without missing
        values.
    """
    groups = self._get_combined_df()["group"].dropna()
    distinct = groups.unique().tolist()
    distinct.sort()
    return distinct
[docs]
def list_catalogs(self, group: str | None = None) -> list[str]:
    """List the distinct ``catalog_dir`` values of the combined table.

    Args:
        group: If given, only rows whose ``group`` equals this value
            (case-insensitively) are considered, e.g. ``'glass'``,
            ``'main'``, ``'organic'``, ``'other'``, ``'3d'``.

    Returns:
        Sorted catalog names, without missing values.
    """
    table = self._get_combined_df()
    if group is not None:
        wanted = group.lower()
        table = table[table["group"].str.lower() == wanted]
    catalogs = table["catalog_dir"].dropna().unique().tolist()
    catalogs.sort()
    return catalogs
[docs]
def list_materials(self, catalog: str | None = None) -> list[str]:
    """List the distinct material names of the combined table.

    Args:
        catalog: If given, only rows whose ``catalog_dir`` equals this value
            (case-insensitively) are considered.

    Returns:
        Sorted list of material ``filename_no_ext`` values, without missing
        values.
    """
    table = self._get_combined_df()
    if catalog is not None:
        wanted = catalog.lower()
        table = table[table["catalog_dir"].str.lower() == wanted]
    materials = table["filename_no_ext"].dropna().unique().tolist()
    materials.sort()
    return materials
# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------
def _get_combined_df(self) -> pd.DataFrame:
    """Return built-in + user entries as a single DataFrame (cached).

    The result is stored in ``_combined_cache`` and reused until that
    attribute is reset to ``None``. A ``catalog_dir`` column is added to a
    copy of the built-in table. Built-in rows whose lower-cased
    ``(filename_no_ext, catalog_dir)`` pair also occurs among the user
    entries are left out, and the user rows are appended after the remaining
    built-in rows.

    Returns:
        The combined table.
    """
    cached = self._combined_cache
    if cached is not None:
        return cached

    built_in = self.built_in_df.copy()
    built_in["catalog_dir"] = built_in.apply(
        lambda row: _catalog_dir_from_filename(row["filename"], row.get("group", "")),
        axis=1,
    )

    if not self._user_entries:
        self._combined_cache = built_in
        return self._combined_cache

    user_df = pd.DataFrame(self._user_entries)

    # User entries shadow built-ins with the same (filename_no_ext, catalog_dir)
    user_stems = user_df["filename_no_ext"].str.lower()
    user_catalogs = user_df["catalog_dir"].str.lower()
    shadow_keys = set(zip(user_stems, user_catalogs))

    keep = pd.Series(
        [
            (stem.lower(), catalog_dir.lower()) not in shadow_keys
            for stem, catalog_dir in zip(
                built_in["filename_no_ext"], built_in["catalog_dir"]
            )
        ],
        index=built_in.index,
        dtype=bool,
    )

    self._combined_cache = pd.concat(
        [built_in.loc[keep], user_df], ignore_index=True
    )
    return self._combined_cache
def _find_matches(
    self,
    df: pd.DataFrame,
    name: str,
    reference: str | None,
    min_wavelength: float | None,
    max_wavelength: float | None,
) -> pd.DataFrame:
    """Find and score candidate rows for the given name.

    A row is a candidate when the lower-cased ``name`` occurs as a literal
    substring of its ``category_name``, ``name`` or ``filename_no_ext``.
    Candidates are then narrowed by ``reference`` (literal substring of
    ``category_name``, ``category_name_full``, ``reference``, ``name`` or
    ``filename``) and by the wavelength bounds, each of which must lie inside
    the row's ``[min_wavelength, max_wavelength]`` interval.

    Args:
        df: Table to search.
        name: Material name to look for (case-insensitive).
        reference: Optional extra text a row must contain.
        min_wavelength: Optional wavelength (µm) the row's range must cover.
        max_wavelength: Optional wavelength (µm) the row's range must cover.

    Returns:
        The surviving rows with a ``similarity_score`` column (smallest
        Levenshtein distance between the name and the three name-like
        fields), sorted by ascending score with a fresh index. An empty
        DataFrame without columns when nothing survives.
    """
    name_lower = name.lower()

    def _contains(frame: pd.DataFrame, column: str, needle: str) -> pd.Series:
        # regex=False: material names and citations are plain text, and
        # characters such as "(", "+" or "." must not be read as a pattern.
        return frame[column].str.lower().str.contains(needle, na=False, regex=False)

    dfi = df[
        _contains(df, "category_name", name_lower)
        | _contains(df, "name", name_lower)
        | _contains(df, "filename_no_ext", name_lower)
    ].copy()

    if reference:
        ref_lower = reference.lower()
        dfi = dfi[
            _contains(dfi, "category_name", ref_lower)
            | _contains(dfi, "category_name_full", ref_lower)
            | _contains(dfi, "reference", ref_lower)
            | _contains(dfi, "name", ref_lower)
            | _contains(dfi, "filename", ref_lower)
        ]

    if min_wavelength is not None:
        dfi = dfi[
            (dfi["min_wavelength"] <= min_wavelength)
            & (dfi["max_wavelength"] >= min_wavelength)
        ]
    if max_wavelength is not None:
        dfi = dfi[
            (dfi["min_wavelength"] <= max_wavelength)
            & (dfi["max_wavelength"] >= max_wavelength)
        ]

    if dfi.empty:
        return pd.DataFrame()

    def _score(row: pd.Series) -> int:
        fields = (row["category_name"], row["name"], row["filename_no_ext"])
        # Missing (NaN) cells are skipped instead of crashing on ``.lower()``.
        return min(
            (
                _levenshtein(name_lower, text.lower())
                for text in fields
                if isinstance(text, str)
            ),
            default=len(name_lower),
        )

    # Work on an owned copy so the new column is not written onto a slice.
    dfi = dfi.copy()
    dfi["similarity_score"] = dfi.apply(_score, axis=1)
    return dfi.sort_values("similarity_score").reset_index(drop=True)
def _warn_if_shadow(self, name: str, catalog: str) -> None:
    """Emit a warning if (name, catalog) shadows an existing entry.

    Both values are compared case-insensitively. The built-in table is
    checked first (``filename_no_ext`` against ``name`` and the catalog
    directory derived from ``filename``/``group`` against ``catalog``), then
    the user entries. At most one warning is emitted per call.

    Args:
        name: Material name about to be registered.
        catalog: Catalog name about to be registered.
    """
    name_lower = name.lower()
    catalog_lower = catalog.lower()

    # Check built-in. Rows are narrowed by name first so the catalog
    # directory is only derived for the few rows that can match, not for the
    # whole table on every call.
    bi = self.built_in_df
    same_name = bi[bi["filename_no_ext"].str.lower() == name_lower]
    if not same_name.empty:
        if "group" in same_name.columns:
            groups = same_name["group"].tolist()
        else:
            groups = [""] * len(same_name)
        shadows_built_in = any(
            _catalog_dir_from_filename(filename, group).lower() == catalog_lower
            for filename, group in zip(same_name["filename"], groups)
        )
        if shadows_built_in:
            warnings.warn(
                f"User-registered material '{name}' (catalog='{catalog}') "
                "shadows a built-in entry.",
                OptilandMaterialWarning,
                stacklevel=3,
            )
            return

    # Check existing user entries
    for entry in self._user_entries:
        stem = entry.get("filename_no_ext", "")
        entry_catalog = entry.get("catalog_dir", "")
        # Entries loaded from a CSV index may hold NaN here; such an entry
        # cannot be equal to a string key.
        if not isinstance(stem, str) or not isinstance(entry_catalog, str):
            continue
        if stem.lower() == name_lower and entry_catalog.lower() == catalog_lower:
            warnings.warn(
                f"User-registered material '{name}' (catalog='{catalog}') "
                "overwrites a previously registered user entry.",
                OptilandMaterialWarning,
                stacklevel=3,
            )
            return
def _auto_discover(self) -> None:
    """Check ~/.optiland/catalogs/ and ingest any subdirectories found.

    Each subdirectory, in sorted order, is passed to ``load_catalog``. The
    whole operation is best-effort: a home directory that cannot be
    determined, a catalogs directory that cannot be listed, or a catalog
    that fails to load each emit an :class:`OptilandMaterialWarning` and are
    skipped rather than raised.
    """
    try:
        user_catalogs = pathlib.Path.home() / ".optiland" / "catalogs"
        if not user_catalogs.is_dir():
            return
        subdirs = sorted(p for p in user_catalogs.iterdir() if p.is_dir())
    except (OSError, RuntimeError) as exc:
        # RuntimeError: home directory cannot be resolved.
        # OSError: the catalogs directory cannot be read.
        warnings.warn(
            f"Auto-discovery: could not scan user catalogs: {exc}",
            OptilandMaterialWarning,
            stacklevel=1,
        )
        return

    for subdir in subdirs:
        try:
            self.load_catalog(subdir)
        except Exception as exc:
            warnings.warn(
                f"Auto-discovery: failed to load catalog "
                f"'{subdir.name}': {exc}",
                OptilandMaterialWarning,
                stacklevel=1,
            )