#!/usr/bin/env node

/**
 * scrape_mdx_to_llms.js
 *
 * Custom file scraper that generates an `llms.txt` file from the `.mdx`
 * sources of the docs-v2 repository. It reads `docs.json` to find the pages
 * that are actually rendered, then for each page extracts:
 *   - the page title (frontmatter, first `# h1`, or the path as fallback)
 *   - each section (markdown `##`-`####` headers and HTML `<h1>`-`<h4>` tags)
 *   - each link inside a section (Card/Tooltip components, markdown links,
 *     and any other `href` attributes), with title and absolute URL.
 *
 * Usage: drop this script into the repo and run `node scrape_mdx_to_llms.js`;
 * the output is written to `llms.txt` next to the script.
 */

const fs = require('fs');
const path = require('path');

class MDXScraper {
  /**
   * @param {string} baseURL - Base URL prepended to relative doc links.
   */
  constructor(baseURL = 'https://auth0-migration.mintlify.app') {
    this.output = [];
    // NOTE(review): assumes the script sits one level above docs-v2/main.
    // The commit message says to run it from inside the repo's main
    // directory — confirm whether this should just be `__dirname`.
    this.mainFolder = path.join(__dirname, 'docs-v2', 'main');
    this.docsJsonPath = path.join(this.mainFolder, 'docs.json');
    this.renderedPages = new Set();
    this.baseURL = baseURL;
  }

  /**
   * Parse docs.json to get all pages that are actually rendered.
   *
   * Walks the `navigation` tree recursively; any string that looks like a
   * docs path (`/docs...` or `docs/...`) is collected into
   * `this.renderedPages` (leading slash stripped).
   *
   * @returns {string[]} The unique rendered page paths (empty on error).
   */
  parseDocsJson() {
    try {
      const docsContent = fs.readFileSync(this.docsJsonPath, 'utf-8');
      const docsJson = JSON.parse(docsContent);

      // Recursive function to extract all page paths from the navigation structure.
      const extractPages = (obj) => {
        if (typeof obj === 'string') {
          // Direct page path.
          if (obj.startsWith('/docs') || obj.startsWith('docs/')) {
            this.renderedPages.add(obj.replace(/^\//, '')); // Remove leading slash
          }
          return;
        }

        if (Array.isArray(obj)) {
          obj.forEach(extractPages);
          return;
        }

        if (typeof obj === 'object' && obj !== null) {
          // Handle `pages` arrays explicitly.
          if (obj.pages) {
            extractPages(obj.pages);
          }

          // Handle other potential arrays/objects that may contain pages.
          Object.values(obj).forEach((value) => {
            if (Array.isArray(value) || (typeof value === 'object' && value !== null)) {
              extractPages(value);
            }
          });
        }
      };

      // Start extraction from navigation.
      if (docsJson.navigation) {
        extractPages(docsJson.navigation);
      }

      console.log(`Found ${this.renderedPages.size} rendered pages in docs.json`);
      return Array.from(this.renderedPages);
    } catch (error) {
      console.warn(`Error parsing docs.json: ${error.message}`);
      return [];
    }
  }

  /**
   * Convert a page path (as listed in docs.json) to a file path on disk.
   *
   * Tries `<page>.mdx` first, then `<page>/index.mdx`.
   *
   * @param {string} pagePath - Page path, with or without leading slash.
   * @returns {string|null} Absolute file path, or null if no file exists.
   */
  pageToFilePath(pagePath) {
    // Remove leading slash if present.
    const cleanPath = pagePath.replace(/^\//, '');

    // Try different file extensions and variations.
    const possiblePaths = [
      path.join(this.mainFolder, `${cleanPath}.mdx`),
      path.join(this.mainFolder, cleanPath, 'index.mdx'),
    ];

    for (const filePath of possiblePaths) {
      if (fs.existsSync(filePath)) {
        return filePath;
      }
    }

    return null;
  }

  /**
   * Extract content from an MDX file: title, canonical URL, and sections
   * (with descriptions and links).
   *
   * Title resolution order: frontmatter `title:` → first `# h1` → last
   * path segment with dashes replaced by spaces.
   *
   * @param {string} filePath - Absolute path to the .mdx file.
   * @param {string} pagePath - The page path (used for URL and fallback title).
   * @returns {{title: string, url: string, pagePath: string, sections: object[]}}
   */
  extractContent(filePath, pagePath) {
    const content = fs.readFileSync(filePath, 'utf-8');

    // Extract frontmatter title. Anchored at the start of the file so a
    // later `---` horizontal rule is not mistaken for frontmatter.
    const frontmatterMatch = content.match(/^---\n([\s\S]*?)\n---/);
    let title = '';

    if (frontmatterMatch) {
      const frontmatter = frontmatterMatch[1];
      const titleMatch = frontmatter.match(/title:\s*['"]?([^'"\n]+)['"]?/);
      if (titleMatch) {
        title = titleMatch[1].trim();
      }
    }

    // If no frontmatter title, try to find the first h1.
    if (!title) {
      const h1Match = content.match(/^#\s+(.+)$/m);
      if (h1Match) {
        title = h1Match[1].trim();
      }
    }

    // Use the page path as a fallback.
    if (!title) {
      const pathParts = pagePath.replace(/^docs\//, '').split('/');
      title = pathParts[pathParts.length - 1].replace(/-/g, ' ');
    }

    return {
      title,
      url: `${this.baseURL}/${pagePath}`,
      pagePath,
      sections: this.extractSectionsWithDescriptionsAndLinks(content),
    };
  }

  /**
   * Comprehensive extraction of sections with descriptions and links.
   *
   * A new section starts at every markdown `##`-`####` header or HTML
   * `<h1>`-`<h4>` tag. A `#####` line is treated as the current section's
   * description. All links found on lines inside a section are attached
   * to that section.
   *
   * @param {string} content - Raw MDX file content.
   * @returns {{level: number, name: string, description: ?string, links: object[]}[]}
   */
  extractSectionsWithDescriptionsAndLinks(content) {
    const sections = [];
    const lines = content.split('\n');
    let currentSection = null;

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i].trim();

      // 1. Extract markdown headers (##, ###, ####).
      if (line.match(/^#{2,4}\s+/)) {
        if (currentSection) sections.push(currentSection);

        const level = line.match(/^#+/)[0].length;
        const name = line.replace(/^#+\s+/, '').trim();

        currentSection = {
          level,
          name: this.cleanHeaderText(name),
          description: null,
          links: [],
        };
      }

      // 2. Extract HTML headers (<h1> through <h4>).
      // The capture groups are (1) the heading level digit and (2) the text.
      else if (line.match(/<h([1-4])[^>]*>([^<]+)<\/h[1-4]>/)) {
        const match = line.match(/<h([1-4])[^>]*>([^<]+)<\/h[1-4]>/);
        if (match) {
          if (currentSection) sections.push(currentSection);

          // Convert h1 -> ##, h2 -> ###, etc.
          const level = Number.parseInt(match[1], 10) + 1;
          const name = match[2].trim();

          currentSection = {
            level,
            name: this.cleanHeaderText(name),
            description: null,
            links: [],
          };
        }
      }

      // 3. Extract descriptions (#####).
      else if (line.startsWith('##### ') && currentSection) {
        currentSection.description = line.substring(6).trim();
      }

      // 4. Extract ALL href attributes and collect links.
      if (currentSection) {
        const foundLinks = this.extractLinksFromLine(line);
        currentSection.links.push(...foundLinks);
      }
    }

    if (currentSection) sections.push(currentSection);
    return sections;
  }

  /**
   * Strip markdown formatting (bold, italics, inline code, link syntax)
   * from header text, keeping only the visible text.
   *
   * @param {string} text - Raw header text.
   * @returns {string} Cleaned header text.
   */
  cleanHeaderText(text) {
    return text
      .replace(/\*\*(.*?)\*\*/g, '$1')
      .replace(/\*(.*?)\*/g, '$1')
      .replace(/`(.*?)`/g, '$1')
      .replace(/\[(.*?)\]\([^)]*\)/g, '$1');
  }

  /**
   * Extract all links from a single line of content.
   *
   * Recognizes, in order: `<Card title=... href=...>` components,
   * `<Tooltip cta=... href=...>` components, markdown `[text](href)`
   * links, and finally any remaining bare `href="..."` attributes
   * (deduplicated against links already found on the line).
   *
   * NOTE(review): the Card/Tooltip patterns require the title/cta
   * attribute to appear before href — confirm that matches how the
   * components are authored in the MDX sources.
   *
   * @param {string} line - A single (trimmed) line of MDX content.
   * @returns {{title: string, href: string}[]} Links with normalized URLs.
   */
  extractLinksFromLine(line) {
    const links = [];

    // Extract Card components with href and title.
    const cardMatches = line.matchAll(
      /<Card[^>]+title=["']([^"']+)["'][^>]+href=["']([^"']+)["'][^>]*>/g
    );
    for (const match of cardMatches) {
      const title = match[1].trim();
      const href = match[2].trim();
      links.push({ title, href: this.normalizeUrl(href) });
    }

    // Extract Tooltip components with href and cta.
    const tooltipMatches = line.matchAll(
      /<Tooltip[^>]+cta=["']([^"']+)["'][^>]+href=["']([^"']+)["'][^>]*>/g
    );
    for (const match of tooltipMatches) {
      const title = match[1].trim();
      const href = match[2].trim();
      links.push({ title, href: this.normalizeUrl(href) });
    }

    // Extract regular markdown links [text](href); only keep docs-relative
    // or absolute http(s) targets.
    const markdownMatches = line.matchAll(/\[([^\]]+)\]\(([^)]+)\)/g);
    for (const match of markdownMatches) {
      const title = match[1].trim();
      const href = match[2].trim();
      if (href.startsWith('/docs') || href.startsWith('docs/') || href.startsWith('http')) {
        links.push({ title, href: this.normalizeUrl(href) });
      }
    }

    // Extract any other href attributes not already captured above.
    const hrefMatches = line.matchAll(/href=["']([^"']+)["']/g);
    for (const match of hrefMatches) {
      const href = match[1].trim();

      // Skip if already captured above.
      if (!links.some((link) => link.href === this.normalizeUrl(href))) {
        // Derive a title from the last path segment; fall back to the raw
        // href when the URL ends in a slash (pop() would yield '').
        const lastSegment = href.split('/').pop();
        const title = lastSegment ? lastSegment.replace(/-/g, ' ') : href;
        links.push({ title, href: this.normalizeUrl(href) });
      }
    }

    return links;
  }

  /**
   * Normalize an href into an absolute URL using the configured base URL.
   * Absolute http(s) URLs pass through unchanged; unrecognized relative
   * forms are returned as-is.
   *
   * @param {string} href - Raw link target.
   * @returns {string} Normalized URL.
   */
  normalizeUrl(href) {
    if (href.startsWith('http')) {
      return href;
    }

    if (href.startsWith('/')) {
      return `${this.baseURL}${href}`;
    }

    if (href.startsWith('docs/')) {
      return `${this.baseURL}/${href}`;
    }

    return href;
  }

  /**
   * Format one page's extracted data for llms.txt output.
   *
   * Emits the page title and source URL, then each section with its
   * optional description and its links as a markdown list. Sections are
   * separated by `***`; a final `---------` closes the page.
   *
   * @param {{title: string, url: string, sections: object[]}} fileData
   * @returns {string} Formatted text for this page.
   */
  formatForOutput(fileData) {
    const output = [];

    // Add page header with title and URL.
    output.push(`# ${fileData.title}`);
    output.push(`Source: ${fileData.url}`);
    output.push('');

    // Add sections with descriptions and links.
    if (fileData.sections.length > 0) {
      for (let i = 0; i < fileData.sections.length; i++) {
        const section = fileData.sections[i];

        // Add section header with proper markdown level.
        const headerLevel = '#'.repeat(section.level);
        output.push(`${headerLevel} ${section.name}`);
        output.push('');

        // Add description if present.
        if (section.description) {
          output.push(`##### ${section.description}`);
          output.push('');
        }

        // Add links as a markdown list.
        if (section.links.length > 0) {
          for (const link of section.links) {
            output.push(`- [${link.title}](${link.href})`);
          }
          output.push('');
        }

        // Add separator between sections (except for the last section).
        if (i < fileData.sections.length - 1) {
          output.push('***');
          output.push('');
        }
      }

      // Add final separator.
      output.push('---------');
    }

    return output.join('\n');
  }

  /**
   * Main scraping method: resolve every rendered page from docs.json to a
   * file, extract and format its content, and join the results.
   *
   * Pages with no matching file are skipped with a warning; extraction
   * errors are logged and counted but do not abort the run.
   *
   * @returns {string} The combined formatted output ('' if nothing found).
   */
  scrape() {
    console.log('🔍 Parsing docs.json for rendered pages...');
    const renderedPages = this.parseDocsJson();

    if (renderedPages.length === 0) {
      console.warn('No rendered pages found in docs.json');
      return '';
    }

    const allOutput = [];
    let processedCount = 0;
    let errorCount = 0;

    for (const pagePath of renderedPages) {
      const filePath = this.pageToFilePath(pagePath);

      if (!filePath) {
        console.log(`⚠️  No file found for page: ${pagePath}`);
        continue;
      }

      console.log(`📄 Processing: ${pagePath}`);

      try {
        const fileData = this.extractContent(filePath, pagePath);
        const formattedOutput = this.formatForOutput(fileData);

        if (formattedOutput.trim()) {
          allOutput.push(formattedOutput);
          processedCount++;
        }
      } catch (error) {
        console.warn(`⚠️  Error processing ${pagePath}: ${error.message}`);
        errorCount++;
      }
    }

    console.log(`✅ Successfully processed ${processedCount} pages`);
    if (errorCount > 0) {
      console.log(`⚠️  Encountered errors in ${errorCount} pages`);
    }

    return allOutput.join('\n\n');
  }

  /**
   * Generate the llms.txt file next to this script.
   *
   * Prepends a generated header (base URL + timestamp) to the scraped
   * content and writes the result to disk.
   *
   * @returns {string|null} The output file path, or null if no content.
   */
  generateLLMSFile() {
    console.log('🚀 Starting MDX scraping using docs.json...');

    const scrapedContent = this.scrape();

    if (!scrapedContent.trim()) {
      console.error('❌ No content was scraped');
      return null;
    }

    const outputPath = path.join(__dirname, 'llms.txt');

    // Add header to the file.
    const header = `# Auth0 Documentation - LLMs.txt
# Generated from rendered pages listed in docs.json
# Structured format with sections, descriptions, and links
# Base URL: ${this.baseURL}
# Generated on: ${new Date().toISOString()}

`;

    const finalContent = header + scrapedContent;

    fs.writeFileSync(outputPath, finalContent, 'utf-8');
    console.log(`✅ Generated llms.txt with ${finalContent.length} characters`);
    console.log(`📍 File saved to: ${outputPath}`);

    return outputPath;
  }
}

// Run the scraper if called directly.
if (require.main === module) {
  const scraper = new MDXScraper();
  scraper.generateLLMSFile();
}

module.exports = MDXScraper;