#!/usr/bin/env node

/**
 * scrape_mdx_to_llms.js
 *
 * Custom file scraper that generates an `llms.txt` file from the `.mdx`
 * sources of the docs-v2 repository. It reads `docs.json` to find the pages
 * that are actually rendered, then for each page extracts:
 *   - the page title (frontmatter, first `# h1`, or the path as fallback)
 *   - each section (markdown `##`-`####` headers and HTML `<h1>`-`<h4>` tags)
 *   - each link inside a section (Card/Tooltip components, markdown links,
 *     and any other `href` attributes), with title and absolute URL.
 *
 * Usage: drop this script into the repo and run `node scrape_mdx_to_llms.js`;
 * the output is written to `llms.txt` next to the script.
 */

const fs = require('fs');
const path = require('path');

class MDXScraper {
  /**
   * @param {string} baseURL - Base URL prepended to relative doc links.
   */
  constructor(baseURL = 'https://auth0-migration.mintlify.app') {
    this.output = [];
    // NOTE(review): assumes the script sits one level above docs-v2/main.
    // The commit message says to run it from inside the repo's main
    // directory — confirm whether this should just be `__dirname`.
    this.mainFolder = path.join(__dirname, 'docs-v2', 'main');
    this.docsJsonPath = path.join(this.mainFolder, 'docs.json');
    this.renderedPages = new Set();
    this.baseURL = baseURL;
  }

  /**
   * Parse docs.json to get all pages that are actually rendered.
   *
   * Walks the `navigation` tree recursively; any string that looks like a
   * docs path (`/docs...` or `docs/...`) is collected into
   * `this.renderedPages` (leading slash stripped).
   *
   * @returns {string[]} The unique rendered page paths (empty on error).
   */
  parseDocsJson() {
    try {
      const docsContent = fs.readFileSync(this.docsJsonPath, 'utf-8');
      const docsJson = JSON.parse(docsContent);

      // Recursive function to extract all page paths from the navigation structure.
      const extractPages = (obj) => {
        if (typeof obj === 'string') {
          // Direct page path.
          if (obj.startsWith('/docs') || obj.startsWith('docs/')) {
            this.renderedPages.add(obj.replace(/^\//, '')); // Remove leading slash
          }
          return;
        }

        if (Array.isArray(obj)) {
          obj.forEach(extractPages);
          return;
        }

        if (typeof obj === 'object' && obj !== null) {
          // Handle `pages` arrays explicitly.
          if (obj.pages) {
            extractPages(obj.pages);
          }

          // Handle other potential arrays/objects that may contain pages.
          Object.values(obj).forEach((value) => {
            if (Array.isArray(value) || (typeof value === 'object' && value !== null)) {
              extractPages(value);
            }
          });
        }
      };

      // Start extraction from navigation.
      if (docsJson.navigation) {
        extractPages(docsJson.navigation);
      }

      console.log(`Found ${this.renderedPages.size} rendered pages in docs.json`);
      return Array.from(this.renderedPages);
    } catch (error) {
      console.warn(`Error parsing docs.json: ${error.message}`);
      return [];
    }
  }

  /**
   * Convert a page path (as listed in docs.json) to a file path on disk.
   *
   * Tries `<page>.mdx` first, then `<page>/index.mdx`.
   *
   * @param {string} pagePath - Page path, with or without leading slash.
   * @returns {string|null} Absolute file path, or null if no file exists.
   */
  pageToFilePath(pagePath) {
    // Remove leading slash if present.
    const cleanPath = pagePath.replace(/^\//, '');

    // Try different file extensions and variations.
    const possiblePaths = [
      path.join(this.mainFolder, `${cleanPath}.mdx`),
      path.join(this.mainFolder, cleanPath, 'index.mdx'),
    ];

    for (const filePath of possiblePaths) {
      if (fs.existsSync(filePath)) {
        return filePath;
      }
    }

    return null;
  }

  /**
   * Extract content from an MDX file: title, canonical URL, and sections
   * (with descriptions and links).
   *
   * Title resolution order: frontmatter `title:` → first `# h1` → last
   * path segment with dashes replaced by spaces.
   *
   * @param {string} filePath - Absolute path to the .mdx file.
   * @param {string} pagePath - The page path (used for URL and fallback title).
   * @returns {{title: string, url: string, pagePath: string, sections: object[]}}
   */
  extractContent(filePath, pagePath) {
    const content = fs.readFileSync(filePath, 'utf-8');

    // Extract frontmatter title. Anchored at the start of the file so a
    // later `---` horizontal rule is not mistaken for frontmatter.
    const frontmatterMatch = content.match(/^---\n([\s\S]*?)\n---/);
    let title = '';

    if (frontmatterMatch) {
      const frontmatter = frontmatterMatch[1];
      const titleMatch = frontmatter.match(/title:\s*['"]?([^'"\n]+)['"]?/);
      if (titleMatch) {
        title = titleMatch[1].trim();
      }
    }

    // If no frontmatter title, try to find the first h1.
    if (!title) {
      const h1Match = content.match(/^#\s+(.+)$/m);
      if (h1Match) {
        title = h1Match[1].trim();
      }
    }

    // Use the page path as a fallback.
    if (!title) {
      const pathParts = pagePath.replace(/^docs\//, '').split('/');
      title = pathParts[pathParts.length - 1].replace(/-/g, ' ');
    }

    return {
      title,
      url: `${this.baseURL}/${pagePath}`,
      pagePath,
      sections: this.extractSectionsWithDescriptionsAndLinks(content),
    };
  }

  /**
   * Comprehensive extraction of sections with descriptions and links.
   *
   * A new section starts at every markdown `##`-`####` header or HTML
   * `<h1>`-`<h4>` tag. A `#####` line is treated as the current section's
   * description. All links found on lines inside a section are attached
   * to that section.
   *
   * @param {string} content - Raw MDX file content.
   * @returns {{level: number, name: string, description: ?string, links: object[]}[]}
   */
  extractSectionsWithDescriptionsAndLinks(content) {
    const sections = [];
    const lines = content.split('\n');
    let currentSection = null;

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i].trim();

      // 1. Extract markdown headers (##, ###, ####).
      if (line.match(/^#{2,4}\s+/)) {
        if (currentSection) sections.push(currentSection);

        const level = line.match(/^#+/)[0].length;
        const name = line.replace(/^#+\s+/, '').trim();

        currentSection = {
          level,
          name: this.cleanHeaderText(name),
          description: null,
          links: [],
        };
      }

      // 2. Extract HTML headers (<h1> through <h4>).
      // The capture groups are (1) the heading level digit and (2) the text.
      else if (line.match(/<h([1-4])[^>]*>([^<]+)<\/h[1-4]>/)) {
        const match = line.match(/<h([1-4])[^>]*>([^<]+)<\/h[1-4]>/);
        if (match) {
          if (currentSection) sections.push(currentSection);

          // Convert h1 -> ##, h2 -> ###, etc.
          const level = Number.parseInt(match[1], 10) + 1;
          const name = match[2].trim();

          currentSection = {
            level,
            name: this.cleanHeaderText(name),
            description: null,
            links: [],
          };
        }
      }

      // 3. Extract descriptions (#####).
      else if (line.startsWith('##### ') && currentSection) {
        currentSection.description = line.substring(6).trim();
      }

      // 4. Extract ALL href attributes and collect links.
      if (currentSection) {
        const foundLinks = this.extractLinksFromLine(line);
        currentSection.links.push(...foundLinks);
      }
    }

    if (currentSection) sections.push(currentSection);
    return sections;
  }

  /**
   * Strip markdown formatting (bold, italics, inline code, link syntax)
   * from header text, keeping only the visible text.
   *
   * @param {string} text - Raw header text.
   * @returns {string} Cleaned header text.
   */
  cleanHeaderText(text) {
    return text
      .replace(/\*\*(.*?)\*\*/g, '$1')
      .replace(/\*(.*?)\*/g, '$1')
      .replace(/`(.*?)`/g, '$1')
      .replace(/\[(.*?)\]\([^)]*\)/g, '$1');
  }

  /**
   * Extract all links from a single line of content.
   *
   * Recognizes, in order: `<Card title=... href=...>` components,
   * `<Tooltip cta=... href=...>` components, markdown `[text](href)`
   * links, and finally any remaining bare `href="..."` attributes
   * (deduplicated against links already found on the line).
   *
   * NOTE(review): the Card/Tooltip patterns require the title/cta
   * attribute to appear before href — confirm that matches how the
   * components are authored in the MDX sources.
   *
   * @param {string} line - A single (trimmed) line of MDX content.
   * @returns {{title: string, href: string}[]} Links with normalized URLs.
   */
  extractLinksFromLine(line) {
    const links = [];

    // Extract Card components with href and title.
    const cardMatches = line.matchAll(
      /<Card[^>]+title=["']([^"']+)["'][^>]+href=["']([^"']+)["'][^>]*>/g
    );
    for (const match of cardMatches) {
      const title = match[1].trim();
      const href = match[2].trim();
      links.push({ title, href: this.normalizeUrl(href) });
    }

    // Extract Tooltip components with href and cta.
    const tooltipMatches = line.matchAll(
      /<Tooltip[^>]+cta=["']([^"']+)["'][^>]+href=["']([^"']+)["'][^>]*>/g
    );
    for (const match of tooltipMatches) {
      const title = match[1].trim();
      const href = match[2].trim();
      links.push({ title, href: this.normalizeUrl(href) });
    }

    // Extract regular markdown links [text](href); only keep docs-relative
    // or absolute http(s) targets.
    const markdownMatches = line.matchAll(/\[([^\]]+)\]\(([^)]+)\)/g);
    for (const match of markdownMatches) {
      const title = match[1].trim();
      const href = match[2].trim();
      if (href.startsWith('/docs') || href.startsWith('docs/') || href.startsWith('http')) {
        links.push({ title, href: this.normalizeUrl(href) });
      }
    }

    // Extract any other href attributes not already captured above.
    const hrefMatches = line.matchAll(/href=["']([^"']+)["']/g);
    for (const match of hrefMatches) {
      const href = match[1].trim();

      // Skip if already captured above.
      if (!links.some((link) => link.href === this.normalizeUrl(href))) {
        // Derive a title from the last path segment; fall back to the raw
        // href when the URL ends in a slash (pop() would yield '').
        const lastSegment = href.split('/').pop();
        const title = lastSegment ? lastSegment.replace(/-/g, ' ') : href;
        links.push({ title, href: this.normalizeUrl(href) });
      }
    }

    return links;
  }

  /**
   * Normalize an href into an absolute URL using the configured base URL.
   * Absolute http(s) URLs pass through unchanged; unrecognized relative
   * forms are returned as-is.
   *
   * @param {string} href - Raw link target.
   * @returns {string} Normalized URL.
   */
  normalizeUrl(href) {
    if (href.startsWith('http')) {
      return href;
    }

    if (href.startsWith('/')) {
      return `${this.baseURL}${href}`;
    }

    if (href.startsWith('docs/')) {
      return `${this.baseURL}/${href}`;
    }

    return href;
  }

  /**
   * Format one page's extracted data for llms.txt output.
   *
   * Emits the page title and source URL, then each section with its
   * optional description and its links as a markdown list. Sections are
   * separated by `***`; a final `---------` closes the page.
   *
   * @param {{title: string, url: string, sections: object[]}} fileData
   * @returns {string} Formatted text for this page.
   */
  formatForOutput(fileData) {
    const output = [];

    // Add page header with title and URL.
    output.push(`# ${fileData.title}`);
    output.push(`Source: ${fileData.url}`);
    output.push('');

    // Add sections with descriptions and links.
    if (fileData.sections.length > 0) {
      for (let i = 0; i < fileData.sections.length; i++) {
        const section = fileData.sections[i];

        // Add section header with proper markdown level.
        const headerLevel = '#'.repeat(section.level);
        output.push(`${headerLevel} ${section.name}`);
        output.push('');

        // Add description if present.
        if (section.description) {
          output.push(`##### ${section.description}`);
          output.push('');
        }

        // Add links as a markdown list.
        if (section.links.length > 0) {
          for (const link of section.links) {
            output.push(`- [${link.title}](${link.href})`);
          }
          output.push('');
        }

        // Add separator between sections (except for the last section).
        if (i < fileData.sections.length - 1) {
          output.push('***');
          output.push('');
        }
      }

      // Add final separator.
      output.push('---------');
    }

    return output.join('\n');
  }

  /**
   * Main scraping method: resolve every rendered page from docs.json to a
   * file, extract and format its content, and join the results.
   *
   * Pages with no matching file are skipped with a warning; extraction
   * errors are logged and counted but do not abort the run.
   *
   * @returns {string} The combined formatted output ('' if nothing found).
   */
  scrape() {
    console.log('🔍 Parsing docs.json for rendered pages...');
    const renderedPages = this.parseDocsJson();

    if (renderedPages.length === 0) {
      console.warn('No rendered pages found in docs.json');
      return '';
    }

    const allOutput = [];
    let processedCount = 0;
    let errorCount = 0;

    for (const pagePath of renderedPages) {
      const filePath = this.pageToFilePath(pagePath);

      if (!filePath) {
        console.log(`⚠️  No file found for page: ${pagePath}`);
        continue;
      }

      console.log(`📄 Processing: ${pagePath}`);

      try {
        const fileData = this.extractContent(filePath, pagePath);
        const formattedOutput = this.formatForOutput(fileData);

        if (formattedOutput.trim()) {
          allOutput.push(formattedOutput);
          processedCount++;
        }
      } catch (error) {
        console.warn(`⚠️  Error processing ${pagePath}: ${error.message}`);
        errorCount++;
      }
    }

    console.log(`✅ Successfully processed ${processedCount} pages`);
    if (errorCount > 0) {
      console.log(`⚠️  Encountered errors in ${errorCount} pages`);
    }

    return allOutput.join('\n\n');
  }

  /**
   * Generate the llms.txt file next to this script.
   *
   * Prepends a generated header (base URL + timestamp) to the scraped
   * content and writes the result to disk.
   *
   * @returns {string|null} The output file path, or null if no content.
   */
  generateLLMSFile() {
    console.log('🚀 Starting MDX scraping using docs.json...');

    const scrapedContent = this.scrape();

    if (!scrapedContent.trim()) {
      console.error('❌ No content was scraped');
      return null;
    }

    const outputPath = path.join(__dirname, 'llms.txt');

    // Add header to the file.
    const header = `# Auth0 Documentation - LLMs.txt
# Generated from rendered pages listed in docs.json
# Structured format with sections, descriptions, and links
# Base URL: ${this.baseURL}
# Generated on: ${new Date().toISOString()}

`;

    const finalContent = header + scrapedContent;

    fs.writeFileSync(outputPath, finalContent, 'utf-8');
    console.log(`✅ Generated llms.txt with ${finalContent.length} characters`);
    console.log(`📍 File saved to: ${outputPath}`);

    return outputPath;
  }
}

// Run the scraper if called directly.
if (require.main === module) {
  const scraper = new MDXScraper();
  scraper.generateLLMSFile();
}

module.exports = MDXScraper;