blob: 7460d8b88fe9ed8c5fe6b4610435a0be7c346c7f [file]
#!/usr/bin/env python3
# This file is part of Eigen, a lightweight C++ template library
# for linear algebra.
#
# Copyright (C) 2026 The Eigen Authors
#
# This Source Code Form is subject to the terms of the Mozilla
# Public License v. 2.0. If a copy of the MPL was not distributed
# with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
# SPDX-License-Identifier: MPL-2.0
# REUSE-IgnoreStart
"""
Insert per-file SPDX license tags across the Eigen tree.
Idempotent: files already containing an SPDX-License-Identifier tag are left
untouched. Re-running after a clean run produces no changes.
The license id assigned to each file is determined by `classify(path)`:
- LAPACKE/BLAS bridges with Intel copyright -> BSD-3-Clause
- MORSE-derived cmake/Find*.cmake modules -> BSD-3-Clause
- Reference-BLAS Fortran tests blas/testing/*.f -> BSD-3-Clause
- LevenbergMarquardt MINPACK-derived headers -> MPL-2.0 AND LicenseRef-MINPACK
- Everything else under repo -> MPL-2.0
The comment style is determined by the file extension:
- C / C++ / CUDA / Inc / Fortran-fixed-form ('//', '#', '*' etc.)
Files explicitly excluded from inline SPDX (handled instead by REUSE.toml):
- LICENSES/, COPYING.*, LICENSE (license texts)
- *.dox, *.in, *.md, *.dat, *.png, README*, INSTALL, (assets/templates)
*.def, *.natvis, *.css, *.xsl, *.preamble, *.traits, *.krazy
- .git*, .clang*, .coderabbit*, .gitlab/ (config dotfiles)
- failtest/*.cpp (trivial stubs)
- signature_of_eigen3_matrix_library (sentinel file)
"""
# REUSE-IgnoreEnd
from __future__ import annotations
import argparse
import re
import subprocess
import sys
from pathlib import Path
from typing import Iterable
REPO_ROOT = Path(__file__).resolve().parent.parent
SPDX_RE = re.compile(r"SPDX-License-Identifier\s*:")
# Match REUSE's own copyright detection (case-sensitive `Copyright`); lowercase
# `copyright` lines (typo in some headers) won't be recognised, so we'll still
# inject `SPDX-FileCopyrightText` for those files.
COPYRIGHT_RE = re.compile(r"Copyright\b")
# ---------------------------------------------------------------------------
# Path classification
# ---------------------------------------------------------------------------
# Files containing an Intel BSD-3-Clause copyright header (LAPACKE/BLAS bridges).
# Match by content rather than path so this stays correct as files are added.
INTEL_BSD_MARKER = b"Copyright (c) 2011, Intel Corporation"
INTEL_BSD_MARKER_2010 = b"Copyright (c) 2010, Intel Corp"
# MORSE-derived cmake modules (Univ. of Tennessee 2009-2014).
MORSE_MARKER = b"@copyright (c) 2009-2014 The University of Tennessee"
# Header marker for the LevenbergMarquardt MINPACK-derived files.
MINPACK_MARKER = b"Minpack license"
# Reference-BLAS / LAPACK Fortran (Univ. of Tennessee/Berkeley/Colorado Denver).
REFBLAS_MARKERS = (
b"Reference BLAS is a software package",
b"LAPACK is a software package provided by Univ. of Tennessee",
)
# Apache-2.0 — TensorFlow-derived BFloat16, Codeplay FindComputeCpp.cmake.
APACHE_MARKER = b"Licensed under the Apache License"
def in_failtest(rel: Path) -> bool:
return rel.parts and rel.parts[0] == "failtest"
# Bulk-mapped via REUSE.toml — never insert inline SPDX into these.
SKIP_BY_NAME = {
"CHANGELOG.md",
"INSTALL",
"README.md",
"signature_of_eigen3_matrix_library",
"CLAUDE.md",
}
SKIP_BY_SUFFIX = {
".dox",
".in",
".dat",
".png",
".svg",
".jpg",
".jpeg",
".gif",
".md",
".txt", # README.txt, *.txt config — handled by REUSE.toml
".def", # blas/eigen_blas.def, lapack/eigen_lapack*.def
".natvis",
".css",
".xsl",
".preamble",
".traits",
".krazy",
".yaml", # .coderabbit.yaml — REUSE.toml
".entry", # doc/examples/make_circulant.cpp.entry / .evaluator / ...
".evaluator",
".expression",
".main",
}
# Top-level / specially-handled excluded paths (relative to repo root).
SKIP_PATHS = {
".clang-format",
".clang-tidy",
".coderabbit.yaml",
".git-blame-ignore-revs",
".gitattributes",
".gitignore",
"LICENSE",
"REUSE.toml",
}
def is_skipped(rel: Path) -> bool:
name = rel.name
posix = rel.as_posix()
if posix in SKIP_PATHS:
return True
if posix.startswith("LICENSES/") or posix.startswith("COPYING"):
return True
if posix.startswith(".gitlab/") or posix.startswith(".git/"):
return True
if in_failtest(rel) and rel.suffix == ".cpp":
return True
# Recognise CMakeLists.txt by name BEFORE the .txt suffix-skip below.
if name == "CMakeLists.txt":
return False
if name in SKIP_BY_NAME:
return True
if rel.suffix in SKIP_BY_SUFFIX:
return True
return False
def classify(rel: Path, content: bytes) -> str:
"""Return the SPDX-License-Identifier expression for a file."""
if INTEL_BSD_MARKER in content or INTEL_BSD_MARKER_2010 in content:
return "BSD-3-Clause"
if MORSE_MARKER in content:
return "BSD-3-Clause"
if MINPACK_MARKER in content:
return "MPL-2.0 AND LicenseRef-MINPACK"
if any(m in content for m in REFBLAS_MARKERS):
return "BSD-3-Clause"
if APACHE_MARKER in content:
return "Apache-2.0"
return "MPL-2.0"
# ---------------------------------------------------------------------------
# Comment-style dispatch
# ---------------------------------------------------------------------------
# (suffix -> comment prefix). Files without a known suffix are handled by name.
SLASH_LANGUAGES = {".h", ".hh", ".cpp", ".cc", ".c", ".cu", ".inc", ".js"}
HASH_LANGUAGES = {".cmake", ".sh", ".py", ".ps1", ".yml", ".cfg"}
FORTRAN_LANGUAGES = {".f", ".F", ".f90"}
def is_eigen_umbrella_header(rel: Path) -> bool:
"""`Eigen/Core`, `unsupported/Eigen/Tensor`, etc. — extensionless C++ headers."""
parts = rel.parts
if rel.suffix:
return False
if len(parts) >= 2 and parts[0] == "Eigen" and parts[-1][:1].isupper():
return len(parts) == 2
if (
len(parts) >= 3
and parts[0] == "unsupported"
and parts[1] == "Eigen"
and parts[-1][:1].isupper()
):
# unsupported/Eigen/Tensor, ...
return True
return False
def has_shebang(path: Path) -> bool:
try:
with path.open("rb") as f:
return f.read(2) == b"#!"
except OSError:
return False
def comment_prefix_for(rel: Path, full: Path | None = None) -> str | None:
if rel.suffix in SLASH_LANGUAGES:
return "//"
if rel.suffix in HASH_LANGUAGES:
return "#"
if rel.suffix in FORTRAN_LANGUAGES:
return "*>"
name = rel.name
if name == "CMakeLists.txt":
return "#"
if is_eigen_umbrella_header(rel):
return "//"
# Extensionless executable scripts (e.g. scripts/eigen_gen_docs).
if not rel.suffix and full is not None and has_shebang(full):
return "#"
return None
# ---------------------------------------------------------------------------
# Insertion strategy
# ---------------------------------------------------------------------------
def insert_spdx(
path: Path, content: bytes, spdx_id: str, comment: str
) -> bytes | None:
"""Return new file bytes with the SPDX line inserted, or None for no-op."""
full_text = content.decode("utf-8", errors="replace")
if SPDX_RE.search(full_text):
return None # Already tagged.
head = full_text[:4096]
has_copyright = COPYRIGHT_RE.search(head) is not None
# Match the file's line-ending style (CRLF vs LF) so we don't introduce
# mixed endings that clang-format will then "fix".
eol = "\r\n" if b"\r\n" in content[:4096] else "\n"
# REUSE-IgnoreStart
spdx_lines = []
if not has_copyright:
spdx_lines.append(f"{comment} SPDX-FileCopyrightText: The Eigen Authors{eol}")
spdx_lines.append(f"{comment} SPDX-License-Identifier: {spdx_id}{eol}")
spdx_line = "".join(spdx_lines).encode()
# REUSE-IgnoreEnd
lines = full_text.splitlines(keepends=True)
# Two header families to consider:
#
# (a) Existing C-line-comment MPL prose ("// This file is part of Eigen, ...
# // ... obtain one at http://mozilla.org/MPL/2.0/."): append SPDX
# directly after the prose block (last consecutive comment line at
# top-of-file).
#
# (b) Existing C-block-comment header (Intel "/* Copyright (c) 2011, Intel
# Corporation ... */"): append a comment line right after the closing
# "*/".
#
# (c) Hash-comment prose (.cmake / .gitlab-ci.yml / etc.): append after the
# last consecutive '#'-comment line at top.
#
# (d) Fortran '*'-comment prose (Reference BLAS): append a `*>` line after
# the last consecutive '*'-prefixed line at top.
#
# (e) No header: insert SPDX as the very first line, then a blank line.
#
# Files starting with `#!shebang` or a `#ifndef` guard — we always insert
# the SPDX line above the guard but below any shebang.
insert_at = 0
has_shebang = lines and lines[0].startswith("#!")
if has_shebang:
insert_at = 1
# (b) C-block comment opening at index `insert_at`.
if insert_at < len(lines) and lines[insert_at].lstrip().startswith("/*"):
for i in range(insert_at, len(lines)):
if "*/" in lines[i]:
insert_at = i + 1
# Skip a single blank line after the block, if any.
break
new_block = spdx_line.decode()
# We want SPDX as a // line after the */.
return ("".join(lines[:insert_at]) + new_block + "".join(lines[insert_at:])).encode()
# (a, c, d) Detect a top-of-file run of comment lines and insert after.
end_of_block = insert_at
if comment == "//":
marker_prefixes = ("//",)
elif comment == "#":
marker_prefixes = ("#",)
elif comment == "*>":
# Fortran free-form file headers begin with `*` or `*>`.
marker_prefixes = ("*",)
else:
marker_prefixes = (comment,)
while end_of_block < len(lines):
s = lines[end_of_block].lstrip()
if any(s.startswith(p) for p in marker_prefixes):
end_of_block += 1
continue
break
if end_of_block > insert_at:
# Append SPDX after the existing prose block.
return (
"".join(lines[:end_of_block])
+ spdx_line.decode()
+ "".join(lines[end_of_block:])
).encode()
# (e) No header — insert at the top (after shebang if present), and
# follow with a single blank line *unless* the file already starts with
# one (otherwise clang-format collapses the duplicate).
next_is_blank = (
insert_at < len(lines) and lines[insert_at].strip() == ""
)
blank = "" if next_is_blank else eol
return (
"".join(lines[:insert_at])
+ spdx_line.decode()
+ blank
+ "".join(lines[insert_at:])
).encode()
# ---------------------------------------------------------------------------
# Driver
# ---------------------------------------------------------------------------
def tracked_files() -> Iterable[Path]:
out = subprocess.check_output(
["git", "ls-files"], cwd=REPO_ROOT, text=True
)
for line in out.splitlines():
if line:
yield Path(line)
def main() -> int:
p = argparse.ArgumentParser(description=__doc__)
p.add_argument(
"--check",
action="store_true",
help="Don't write; exit 1 if any file would change.",
)
p.add_argument(
"--paths",
nargs="*",
help="Limit to these paths (relative to repo root). Default: all tracked.",
)
p.add_argument(
"-v", "--verbose",
action="store_true",
help="Print every file decision (skip / classify / change).",
)
args = p.parse_args()
if args.paths:
targets = [Path(p) for p in args.paths]
else:
targets = list(tracked_files())
changed = []
for rel in targets:
if is_skipped(rel):
if args.verbose:
print(f"[skip-rule] {rel}")
continue
full = REPO_ROOT / rel
prefix = comment_prefix_for(rel, full)
if prefix is None:
if args.verbose:
print(f"[skip-ext] {rel}")
continue
try:
content = full.read_bytes()
except (FileNotFoundError, IsADirectoryError):
continue
spdx_id = classify(rel, content)
new_content = insert_spdx(rel, content, spdx_id, prefix)
if new_content is None:
if args.verbose:
print(f"[ok] {rel} ({spdx_id})")
continue
if args.check:
print(f"[would-change] {rel} ({spdx_id})")
changed.append(rel)
continue
full.write_bytes(new_content)
print(f"[changed] {rel} ({spdx_id})")
changed.append(rel)
if args.check and changed:
print(f"\n{len(changed)} file(s) need SPDX headers.", file=sys.stderr)
return 1
if not args.check:
print(f"\nUpdated {len(changed)} file(s).")
return 0
if __name__ == "__main__":
sys.exit(main())