📝 Update link syntax to minimal Markdown by tiangolo · Pull Request #1800

📝 Update link syntax to minimal Markdown by tiangolo · Pull Request #1800 · fastapi/sqlmodel
No longer use the classes external-link and internal-link, remove target=_blank, which is now added automatically by JS, and move from HTML to Markdown syntax (as now it no longer needs anything from HTML).
The migration was done automatically with the script below, which was generated with Claude Opus 4.6; the results were then reviewed by (human) hand, one by one.
#!/usr/bin/env python3
"""
Remove .external-link and .internal-link classes and target=_blank from docs,
converting HTML <a> tags to Markdown links where possible.

Handles these patterns:

1. HTML external/internal links (simple text content):
   <a href="URL" class="external-link" target="_blank">Text</a>
   → [Text](URL)

2. HTML links with <strong> content:
   <a href="URL" class="external-link" target="_blank"><strong>Text</strong></a>
   → [**Text**](URL)

3. HTML links with <code> content:
   <a href="URL" class="external-link" target="_blank"><code>text</code></a>
   → [`text`](URL)

4. HTML links with <abbr> (converted, abbr kept inline):
   <a href="URL" class="external-link" target="_blank"><abbr title="T">Text</abbr></a>
   → [<abbr title="T">Text</abbr>](URL)

5. HTML links with mixed <abbr> + <code>/<strong> (converted):
   <a href="URL" class="external-link" target="_blank"><abbr>X</abbr> docs for <code>POST</code></a>
   → [<abbr>X</abbr> docs for `POST`](URL)

6. Markdown attr_list (.internal-link or .external-link and target=_blank removed):
   [Text](URL){.internal-link target=_blank}
   → [Text](URL)

7. HTML <a> tags with target="_blank" but no class attribute (target removed):
   <a href="URL" target="_blank">Text</a>
   → [Text](URL)

8. Markdown attr_list with only target=_blank (removed):
   [Text](URL){target=_blank}
   → [Text](URL)
"""

import re
import sys
from pathlib import Path


def convert_html_link_to_markdown(match: str) -> str:
    """Convert a single HTML ``<a>`` element to a Markdown link.

    Args:
        match: The full text of one ``<a ...>...</a>`` element.

    Returns:
        ``[content](href)`` when the element's content can be expressed in
        Markdown. ``<strong>``, ``<code>`` and ``<em>`` are converted to their
        Markdown equivalents; ``<abbr>`` elements (with or without attributes)
        are kept inline as HTML. If any other HTML remains in the content, the
        element is kept as HTML with only its ``external-link`` /
        ``internal-link`` class and its ``target`` attribute removed. If no
        ``href`` or no content can be found, ``match`` is returned unchanged.
    """
    # Extract href
    href_m = re.search(r'href="([^"]*)"', match)
    if not href_m:
        return match
    href = href_m.group(1)

    # Extract inner content (between the opening tag's > and the final </a>)
    content_m = re.search(r'>(.+?)</a>\s*$', match, re.DOTALL)
    if not content_m:
        return match
    content = content_m.group(1).strip()

    # Convert inline HTML to Markdown equivalents
    # <strong>text</strong> → **text**
    content = re.sub(r'<strong>(.*?)</strong>', r'**\1**', content)
    # <code>text</code> → `text`
    content = re.sub(r'<code>(.*?)</code>', r'`\1`', content)
    # <em>text</em> → *text*
    content = re.sub(r'<em>(.*?)</em>', r'*\1*', content)

    # <abbr> is allowed to stay inline inside a Markdown link. The attribute
    # part is optional so a bare <abbr>X</abbr> is exempted as well, not only
    # <abbr title="...">X</abbr>.
    content_without_abbr = re.sub(r'<abbr(?:\s[^>]*)?>.*?</abbr>', '', content)
    if '<' in content_without_abbr:
        # Some other HTML is left: keep the element as HTML, but drop the
        # link class and the target attribute.
        result = re.sub(r'\s*class="(?:external|internal)-link"', '', match)
        result = re.sub(r'\s*target="[^"]*"', '', result)
        return result

    return f"[{content}]({href})"


# The patterns below are compiled once at import time; process_file() is
# called once per Markdown file, so rebuilding them per call is wasted work.

# HTML <a> tags with class="external-link" or class="internal-link".
# Handles attributes in any order.
_HTML_CLASS_LINK_RE = re.compile(
    r'<a\s+'
    r'(?=[^>]*class="(?:external|internal)-link")'  # must have the class
    r'[^>]*?'                                        # other attrs before href
    r'href="[^"]*"'                                  # href
    r'[^>]*?'                                        # other attrs
    r'>'                                             # end of opening tag
    r'.+?'                                           # content
    r'</a>',                                         # closing tag
    re.DOTALL,
)

# HTML <a> tags with target="_blank" and no class attribute at all.
_HTML_TARGET_LINK_RE = re.compile(
    r'<a\s+'
    r'(?=[^>]*target="_blank")'                      # must have target=_blank
    r'(?![^>]*class=")'                              # must NOT have a class attr
    r'[^>]*?'                                        # other attrs before href
    r'href="[^"]*"'                                  # href
    r'[^>]*?'                                        # other attrs
    r'>'                                             # end of opening tag
    r'.+?'                                           # content
    r'</a>',                                         # closing tag
    re.DOTALL,
)

# Markdown link followed by an attr_list that starts with .external-link or
# .internal-link. The optional ":" and whitespace padding cover the
# "{: .cls }" and "{ .cls }" spellings that attr_list also accepts.
_MD_CLASS_ATTR_RE = re.compile(
    r'(\[[^\]]+\]\([^)]+\))'                  # [text](url)
    r'\{:?\s*'                                # "{", optional ":" and padding
    r'\.(?:external|internal)-link(?![\w-])'  # the class, not a longer name
    r'\s*'                                    # optional space
    r'([^}]*?)'                               # remaining attrs (e.g. target=_blank)
    r'\s*\}'                                  # optional padding and "}"
)

# Markdown link followed by an attr_list containing only target=_blank.
_MD_TARGET_ATTR_RE = re.compile(
    r'(\[[^\]]+\]\([^)]+\))'                  # [text](url)
    r'\{:?\s*target=_blank\s*\}'              # {target=_blank}, {: target=_blank }
)


def _rewrite_links(text: str) -> tuple[str, list[str]]:
    """Apply the four link rewrites to ``text``; return (new_text, change log)."""
    changes: list[str] = []

    def html_replacer(label: str):
        # Build a re.sub callback that converts one <a> element and logs the
        # change only when the conversion actually altered the text.
        def replace(m: re.Match) -> str:
            before = m.group(0)
            after = convert_html_link_to_markdown(before)
            if after != before:
                changes.append(f"  {label}: {before[:80]}...")
            return after

        return replace

    def replace_md_class(m: re.Match) -> str:
        link_part = m.group(1)
        # Drop target=_blank from whatever follows the link class; any other
        # attributes are kept in a re-emitted attr_list.
        remaining_attrs = re.sub(r'target=_blank\s*', '', m.group(2)).strip()
        changes.append(f"  MD class attr: {m.group(0)[:80]}...")
        if remaining_attrs:
            return f"{link_part}{{{remaining_attrs}}}"
        return link_part

    def replace_md_target(m: re.Match) -> str:
        changes.append(f"  MD target attr: {m.group(0)[:80]}...")
        return m.group(1)

    # The HTML passes run first so the Markdown passes also see their output.
    text = _HTML_CLASS_LINK_RE.sub(html_replacer("HTML class → MD"), text)
    text = _HTML_TARGET_LINK_RE.sub(html_replacer("HTML target → MD"), text)
    text = _MD_CLASS_ATTR_RE.sub(replace_md_class, text)
    text = _MD_TARGET_ATTR_RE.sub(replace_md_target, text)
    return text, changes


def process_file(filepath: Path, dry_run: bool = False) -> tuple[int, list[str]]:
    """Rewrite the links in a single file.

    Args:
        filepath: File to process; read and written as UTF-8.
        dry_run: When true, compute the changes but do not write the file.

    Returns:
        ``(change_count, changes)`` where ``changes`` holds one log line per
        rewritten link, or ``(0, [])`` when the file content is unchanged.
    """
    # newline="" disables newline translation in both directions, so the
    # file's existing line endings (LF or CRLF) are written back untouched.
    with filepath.open(encoding="utf-8", newline="") as src:
        original = src.read()

    text, changes = _rewrite_links(original)
    if text == original:
        return 0, []

    if not dry_run:
        with filepath.open("w", encoding="utf-8", newline="") as dst:
            dst.write(text)
    return len(changes), changes


def main() -> None:
    """Process every ``*.md`` file under ``docs/`` and print a summary.

    Passing ``--dry-run`` on the command line reports the changes without
    writing any file. Exits with status 1 when ``docs`` does not exist in the
    current working directory.
    """
    root = Path("docs")
    if not root.exists():
        print(f"Error: {root} not found. Run from the repo root.")
        sys.exit(1)

    is_dry_run = "--dry-run" in sys.argv
    touched_files = 0
    change_total = 0

    for path in sorted(root.rglob("*.md")):
        n_changes, log_lines = process_file(path, dry_run=is_dry_run)
        if not n_changes:
            # Nothing was rewritten in this file; leave it out of the report.
            continue
        touched_files += 1
        change_total += n_changes
        print(f"{path} ({n_changes} changes)")
        print(*log_lines, sep="\n")

    label = "APPLIED" if not is_dry_run else "DRY RUN"
    print(f"\n{label}: {change_total} changes across {touched_files} files")


# Run the migration only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()