Skip to content

Commit 6890878

Browse files
committed
fix(mdream): cleaner llms-full.txt output
1 parent 244b053 commit 6890878

File tree

3 files changed

+212
-14
lines changed

3 files changed

+212
-14
lines changed

packages/mdream/src/llms-txt.ts

Lines changed: 78 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,48 @@ function generateLlmsTxtContent(files: ProcessedFile[], options: Pick<LlmsTxtArt
202202
return content
203203
}
204204

205+
/**
 * Parse a leading frontmatter block from markdown content.
 *
 * The block must open with `---` on the very first line and close with `---`
 * on a line of its own. Both LF and CRLF line endings are accepted, and the
 * closing delimiter may be the last thing in the input (no trailing newline
 * and no body).
 *
 * Each frontmatter line is split on its first colon; the key and the value are
 * trimmed and the value is kept as a raw string (no YAML typing, quoting or
 * nesting is interpreted). Lines without a colon, or whose colon is the first
 * character, are ignored.
 *
 * @param content - Markdown source that may start with a frontmatter block.
 * @returns The parsed key/value pairs plus the text after the closing
 * delimiter. When no frontmatter block is found, `frontmatter` is `null` and
 * `body` is the unmodified `content`.
 */
function parseFrontmatter(content: string): { frontmatter: Record<string, any> | null, body: string } {
  // `\r?\n` keeps CRLF files working; the trailing group is optional so a
  // document that consists of frontmatter only is still recognised.
  const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---(?:\r?\n([\s\S]*))?$/
  const match = content.match(frontmatterRegex)

  if (!match) {
    return { frontmatter: null, body: content }
  }

  const frontmatterContent = match[1] ?? ''
  // Group 2 is undefined when the closing delimiter ends the input.
  const body = match[2] ?? ''

  const frontmatter: Record<string, any> = {}

  for (const line of frontmatterContent.split(/\r?\n/)) {
    // Split on the FIRST colon only so values such as URLs stay intact.
    const colonIndex = line.indexOf(':')
    if (colonIndex > 0) {
      const key = line.substring(0, colonIndex).trim()
      const value = line.substring(colonIndex + 1).trim()
      frontmatter[key] = value
    }
  }

  return { frontmatter, body }
}
233+
234+
/**
 * Serialize a flat frontmatter object to YAML-like `key: value` lines.
 *
 * Entries whose value is `undefined` or `null` are omitted. Every other value
 * is converted with `String(value)`. Because the output format is strictly one
 * line per key, a value that contains line breaks is folded onto a single
 * line: it is split on line breaks, each piece is trimmed, empty pieces are
 * dropped and the rest are joined with one space. This prevents a multi-line
 * value (or one containing a `---` line) from breaking or prematurely closing
 * the frontmatter block. Values without line breaks are emitted unchanged.
 *
 * @param data - Key/value pairs to serialize, in insertion order.
 * @returns The serialized lines joined with `\n` (no surrounding `---`
 * delimiters and no trailing newline).
 */
function serializeFrontmatter(data: Record<string, any>): string {
  const lines: string[] = []
  for (const [key, value] of Object.entries(data)) {
    if (value === undefined || value === null)
      continue

    const text = String(value)
    // Only touch values that would actually span several lines.
    const singleLine = /[\r\n]/.test(text)
      ? text
          .split(/\r\n|\r|\n/)
          .map(part => part.trim())
          .filter(part => part.length > 0)
          .join(' ')
      : text

    lines.push(`${key}: ${singleLine}`)
  }
  return lines.join('\n')
}
246+
205247
/**
206248
* Generate llms-full.txt content with complete page content
207249
*/
@@ -229,16 +271,46 @@ function generateLlmsFullTxtContent(files: ProcessedFile[], options: Pick<LlmsTx
229271
const url = file.url.startsWith('http://') || file.url.startsWith('https://')
230272
? file.url
231273
: (origin ? origin + file.url : file.url)
232-
content += `## ${file.title}\n\n`
233-
content += `**URL:** ${url}\n`
274+
275+
// Parse existing frontmatter from content
276+
const { frontmatter, body } = parseFrontmatter(file.content)
277+
278+
// Prepare metadata to add
279+
const metadata: Record<string, any> = {
280+
title: file.title,
281+
url,
282+
}
283+
234284
if (file.filePath && options.outputDir) {
235-
const relativePath = relative(options.outputDir, file.filePath)
236-
content += `**File:** ${relativePath}\n`
285+
metadata.file = relative(options.outputDir, file.filePath)
237286
}
238287
else if (file.filePath) {
239-
content += `**File:** ${file.filePath}\n`
288+
metadata.file = file.filePath
289+
}
290+
291+
// Add any additional metadata from the file
292+
if (file.metadata) {
293+
if (file.metadata.description)
294+
metadata.description = file.metadata.description
295+
if (file.metadata.keywords)
296+
metadata.keywords = file.metadata.keywords
297+
if (file.metadata.author)
298+
metadata.author = file.metadata.author
240299
}
241-
content += `\n${file.content}\n\n---\n\n`
300+
301+
// Always include frontmatter for uniform formatting
302+
const mergedFrontmatter = frontmatter ? { ...frontmatter, ...metadata } : metadata
303+
const frontmatterString = serializeFrontmatter(mergedFrontmatter)
304+
let contentBody = frontmatter ? body : file.content
305+
306+
// Remove duplicate title from the beginning of content if it exists
307+
const titleLine = contentBody.trim().split('\n')[0]
308+
if (titleLine === file.title || titleLine === `# ${file.title}`) {
309+
// Remove the first line (title) and any following empty lines
310+
contentBody = contentBody.trim().split('\n').slice(1).join('\n').trimStart()
311+
}
312+
313+
content += `---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`
242314
}
243315
}
244316

packages/mdream/test/unit/cli-llms.test.ts

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,13 +66,15 @@ it('should generate llms-full.txt with complete content', async () => {
6666
expect(fullContent).toContain('- [About Us - Test Site](#about-us---test-site)')
6767

6868
// Check full content is included
69-
expect(fullContent).toContain('## Test Site - Welcome to Our Homepage')
70-
expect(fullContent).toContain('**URL:** /')
71-
expect(fullContent).toContain('Welcome to Test Site')
69+
// Now using frontmatter format with title only in frontmatter
70+
expect(fullContent).toContain('---')
71+
expect(fullContent).toContain('title: Test Site - Welcome to Our Homepage')
72+
expect(fullContent).toContain('url: /')
73+
expect(fullContent).toContain('# Welcome to Test Site')
7274
expect(fullContent).toContain('Homepage Content')
7375

7476
// Check blog post content
75-
expect(fullContent).toContain('## First Blog Post - Test Site Blog')
77+
expect(fullContent).toContain('title: First Blog Post - Test Site Blog')
7678
expect(fullContent).toContain('First Blog Post: Testing HTML to Markdown')
7779
expect(fullContent).toContain('function convertHtmlToMarkdown')
7880
})

packages/mdream/test/unit/llms-txt.test.ts

Lines changed: 128 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1+
import type { ProcessedFile } from '../../src/llms-txt.ts'
12
import { mkdir, rm, writeFile } from 'node:fs/promises'
23
import { tmpdir } from 'node:os'
34
import { join } from 'node:path'
4-
import { expect, it } from 'vitest'
5+
import { describe, expect, it } from 'vitest'
56
import { generateLlmsTxtArtifacts } from '../../src/llms-txt.ts'
67

78
const testDir = join(tmpdir(), 'mdream-llms-txt-test')
@@ -108,9 +109,9 @@ it('should generate llms-full.txt when requested', async () => {
108109
expect(result.llmsFullTxt).toContain('# Test Site')
109110
expect(result.llmsFullTxt).toContain('> A test site for mdream')
110111
expect(result.llmsFullTxt).toContain('## Table of Contents')
111-
expect(result.llmsFullTxt).toContain('## Test Site - Home')
112-
expect(result.llmsFullTxt).toContain('**URL:** https://example.com/')
113-
expect(result.llmsFullTxt).toContain('Welcome to Test Site')
112+
expect(result.llmsFullTxt).toContain('title: Test Site - Home')
113+
expect(result.llmsFullTxt).toContain('url: https://example.com/')
114+
expect(result.llmsFullTxt).toContain('# Welcome to Test Site')
114115

115116
await cleanup()
116117
})
@@ -222,3 +223,126 @@ it('should work with pre-processed files', async () => {
222223
expect(result.llmsFullTxt).toContain('# Welcome')
223224
expect(result.llmsFullTxt).toContain('# About')
224225
})
226+
227+
describe('llms-txt frontmatter handling', () => {
  // Options every case in this suite passes to generateLlmsTxtArtifacts.
  const sharedOptions = {
    siteName: 'Test Site',
    origin: 'https://example.com',
    generateFull: true,
  }

  /** Check, in order, that `output` contains every listed fragment. */
  function expectContainsAll(output: unknown, fragments: string[]): void {
    for (const fragment of fragments)
      expect(output).toContain(fragment)
  }

  /** Check, in order, that `output` contains none of the listed fragments. */
  function expectContainsNone(output: unknown, fragments: string[]): void {
    for (const fragment of fragments)
      expect(output).not.toContain(fragment)
  }

  it('should prepend metadata to existing frontmatter', async () => {
    // A page that already carries its own frontmatter plus extra file metadata.
    const page: ProcessedFile = {
      title: 'Test Page',
      content: `---
existingKey: existingValue
tags: test, sample
---

# Content

This is the main content of the page.`,
      url: '/test-page',
      metadata: {
        description: 'A test page with frontmatter',
        author: 'Test Author',
      },
    }

    const { llmsFullTxt } = await generateLlmsTxtArtifacts({ files: [page], ...sharedOptions })

    expect(llmsFullTxt).toBeDefined()
    expectContainsAll(llmsFullTxt, [
      '---',
      'title: Test Page',
      'url: https://example.com/test-page',
      'existingKey: existingValue',
      'tags: test, sample',
      'description: A test page with frontmatter',
      'author: Test Author',
      '# Content',
    ])
  })

  it('should add frontmatter to content without existing frontmatter', async () => {
    // Plain markdown with no frontmatter block of its own.
    const page: ProcessedFile = {
      title: 'Simple Page',
      content: `# Simple Content

This page has no frontmatter.`,
      url: '/simple-page',
    }

    const { llmsFullTxt } = await generateLlmsTxtArtifacts({ files: [page], ...sharedOptions })

    expect(llmsFullTxt).toBeDefined()
    // Frontmatter with title and url is always emitted
    expectContainsAll(llmsFullTxt, [
      '---',
      'title: Simple Page',
      'url: https://example.com/simple-page',
      '# Simple Content',
    ])
    // The old section header must be gone
    expectContainsNone(llmsFullTxt, ['## Simple Page'])
  })

  it('should handle files with file paths and outputDir', async () => {
    // The file path lives under outputDir, so it should be reported relative to it.
    const page: ProcessedFile = {
      title: 'File Page',
      filePath: '/home/user/output/md/page.md',
      content: `---
category: documentation
---

# Documentation Page`,
      url: '/docs/page',
    }

    const { llmsFullTxt } = await generateLlmsTxtArtifacts({
      files: [page],
      ...sharedOptions,
      outputDir: '/home/user/output',
    })

    expect(llmsFullTxt).toBeDefined()
    expectContainsAll(llmsFullTxt, [
      'file: md/page.md',
      'category: documentation',
    ])
  })

  it('should merge metadata correctly with existing frontmatter', async () => {
    // Existing frontmatter clashes with the generated title/url keys.
    const page: ProcessedFile = {
      title: 'Conflict Test',
      content: `---
title: Original Title
url: /old-url
custom: value
---

Content here`,
      url: '/new-url',
      metadata: {
        description: 'New description',
      },
    }

    const { llmsFullTxt } = await generateLlmsTxtArtifacts({ files: [page], ...sharedOptions })

    expect(llmsFullTxt).toBeDefined()
    // Generated metadata is present and non-conflicting existing keys survive
    expectContainsAll(llmsFullTxt, [
      'title: Conflict Test',
      'url: https://example.com/new-url',
      'description: New description',
      'custom: value',
    ])
    // Conflicting original values are replaced by the generated ones
    expectContainsNone(llmsFullTxt, [
      'title: Original Title',
      'url: /old-url',
    ])
  })
})

0 commit comments

Comments
 (0)