Skip to content

Commit 5fe95f1

Browse files
committed
feat(crawl): --skip-sitemap
Fixes #10
1 parent 3c5f52c commit 5fe95f1

File tree

3 files changed

+36
-4
lines changed

3 files changed

+36
-4
lines changed

packages/crawl/src/cli.ts

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,10 @@ async function interactiveCrawl(): Promise<CrawlOptions | null> {
154154
],
155155
initialValues: ['llms.txt', 'llms-full.txt', 'markdown'],
156156
}),
157+
skipSitemap: () => p.confirm({
158+
message: 'Skip sitemap.xml and robots.txt discovery?',
159+
initialValue: false,
160+
}),
157161
},
158162
{
159163
onCancel: () => {
@@ -192,12 +196,16 @@ async function interactiveCrawl(): Promise<CrawlOptions | null> {
192196
`Max pages: Unlimited`,
193197
`Follow links: Yes (depth 3)`,
194198
`Output formats: ${outputFormats.join(', ')}`,
195-
`Sitemap discovery: Automatic`,
199+
`Sitemap discovery: ${advancedOptions.skipSitemap ? 'Skipped' : 'Automatic'}`,
196200
inferredOrigin && `Origin: ${inferredOrigin}`,
197201
].filter(Boolean)
198202

199203
p.note(summary.join('\n'), 'Crawl Configuration')
200204

205+
// Warn if using skip-sitemap with wildcard URLs in interactive mode
206+
if (advancedOptions.skipSitemap && globPatterns.some(p => p.isGlob)) {
207+
p.log.warn('Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.')
}
208+
201209
return {
202210
urls,
203211
outputDir: resolve(outputDir),
@@ -211,6 +219,7 @@ async function interactiveCrawl(): Promise<CrawlOptions | null> {
211219
globPatterns,
212220
verbose: false,
213221
maxDepth: 3,
222+
skipSitemap: advancedOptions.skipSitemap,
214223
}
215224
}
216225

@@ -254,17 +263,19 @@ Options:
254263
--max-pages <number> Maximum pages to crawl (default: unlimited)
255264
--crawl-delay <seconds> Crawl delay in seconds
256265
--exclude <pattern> Exclude URLs matching glob patterns (can be used multiple times)
266+
--skip-sitemap Skip sitemap.xml and robots.txt discovery
257267
-v, --verbose Enable verbose logging
258268
-h, --help Show this help message
259269
--version Show version number
260270
261-
Note: Sitemap discovery and robots.txt checking are automatic
271+
Note: Sitemap discovery and robots.txt checking are automatic unless --skip-sitemap is used.
262272
263273
Examples:
264274
@mdream/crawl -u harlanzw.com --artifacts "llms.txt,markdown"
265275
@mdream/crawl --url https://docs.example.com --depth 2 --artifacts "llms-full.txt"
266276
@mdream/crawl -u example.com --exclude "*/admin/*" --exclude "*/api/*"
267277
@mdream/crawl -u example.com --verbose
278+
@mdream/crawl -u example.com --skip-sitemap
268279
`)
269280
process.exit(0)
270281
}
@@ -417,6 +428,14 @@ Examples:
417428
// Check for verbose flag
418429
const verbose = args.includes('--verbose') || args.includes('-v')
419430

431+
// Check for skip-sitemap flag
432+
const skipSitemap = args.includes('--skip-sitemap')
433+
434+
// Warn if using skip-sitemap with wildcard URLs
435+
if (skipSitemap && parsed.isGlob) {
436+
p.log.warn('Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.')
437+
}
438+
420439
return {
421440
urls: [url],
422441
outputDir: resolve(getArgValue('--output') || getArgValue('-o') || 'output'),
@@ -434,6 +453,7 @@ Examples:
434453
crawlDelay: crawlDelayStr ? Number.parseInt(crawlDelayStr) : undefined,
435454
exclude: excludePatterns.length > 0 ? excludePatterns : undefined,
436455
verbose,
456+
skipSitemap,
437457
}
438458
}
439459

@@ -465,6 +485,7 @@ async function main() {
465485
`Depth: ${options.maxDepth}`,
466486
`Formats: ${formats.join(', ')}`,
467487
options.exclude && options.exclude.length > 0 && `Exclude: ${options.exclude.join(', ')}`,
488+
options.skipSitemap && `Skip sitemap: Yes`,
468489
options.verbose && `Verbose: Enabled`,
469490
].filter(Boolean)
470491

packages/crawl/src/crawl.ts

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ export async function crawlAndGenerate(options: CrawlOptions, onProgress?: (prog
7272
siteNameOverride,
7373
descriptionOverride,
7474
verbose = false,
75+
skipSitemap = false,
7576
} = options
7677

7778
// Normalize and resolve the output directory
@@ -105,7 +106,7 @@ export async function crawlAndGenerate(options: CrawlOptions, onProgress?: (prog
105106
// Track sitemap discovery attempts
106107
const sitemapAttempts: { url: string, success: boolean, error?: string }[] = []
107108

108-
if (startingUrls.length > 0) {
109+
if (startingUrls.length > 0 && !skipSitemap) {
109110
const baseUrl = new URL(startingUrls[0]).origin
110111
const homePageUrl = baseUrl
111112

@@ -290,6 +291,15 @@ export async function crawlAndGenerate(options: CrawlOptions, onProgress?: (prog
290291
progress.crawling.total = startingUrls.length
291292
onProgress?.(progress)
292293
}
294+
else if (skipSitemap && startingUrls.length > 0) {
295+
// When skipping sitemap discovery, immediately mark as completed
296+
progress.sitemap.status = 'completed'
297+
progress.sitemap.found = 0
298+
progress.sitemap.processed = 0
299+
progress.crawling.total = startingUrls.length
300+
onProgress?.(progress)
301+
// Don't show any sitemap discovery box when skipping
302+
}
293303

294304
// Ensure output directory exists
295305
if (!existsSync(outputDir)) {
@@ -460,7 +470,7 @@ export async function crawlAndGenerate(options: CrawlOptions, onProgress?: (prog
460470
}
461471
},
462472
maxRequestsPerCrawl,
463-
respectRobotsTxtFile: true,
473+
respectRobotsTxtFile: !skipSitemap,
464474
}
465475

466476
// Add crawl delay if specified

packages/crawl/src/types.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ export interface CrawlOptions {
1717
siteNameOverride?: string
1818
descriptionOverride?: string
1919
verbose?: boolean
20+
skipSitemap?: boolean
2021
}
2122

2223
export interface ParsedUrlPattern {

0 commit comments

Comments
 (0)