@@ -154,6 +154,10 @@ async function interactiveCrawl(): Promise<CrawlOptions | null> {
154
154
] ,
155
155
initialValues : [ 'llms.txt' , 'llms-full.txt' , 'markdown' ] ,
156
156
} ) ,
157
+ skipSitemap : ( ) => p . confirm ( {
158
+ message : 'Skip sitemap.xml and robots.txt discovery?' ,
159
+ initialValue : false ,
160
+ } ) ,
157
161
} ,
158
162
{
159
163
onCancel : ( ) => {
@@ -192,12 +196,16 @@ async function interactiveCrawl(): Promise<CrawlOptions | null> {
192
196
`Max pages: Unlimited` ,
193
197
`Follow links: Yes (depth 3)` ,
194
198
`Output formats: ${ outputFormats . join ( ', ' ) } ` ,
195
- `Sitemap discovery: Automatic` ,
199
+ `Sitemap discovery: ${ advancedOptions . skipSitemap ? 'Skipped' : ' Automatic' } ` ,
196
200
inferredOrigin && `Origin: ${ inferredOrigin } ` ,
197
201
] . filter ( Boolean )
198
202
199
203
p . note ( summary . join ( '\n' ) , 'Crawl Configuration' )
200
204
205
+ // Warn if using skip-sitemap with wildcard URLs in interactive mode
206
+ if ( advancedOptions . skipSitemap && globPatterns . some ( p => p . isGlob ) ) {
207
+ p . log . warn ( 'Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.' ) }
208
+
201
209
return {
202
210
urls,
203
211
outputDir : resolve ( outputDir ) ,
@@ -211,6 +219,7 @@ async function interactiveCrawl(): Promise<CrawlOptions | null> {
211
219
globPatterns,
212
220
verbose : false ,
213
221
maxDepth : 3 ,
222
+ skipSitemap : advancedOptions . skipSitemap ,
214
223
}
215
224
}
216
225
@@ -254,17 +263,19 @@ Options:
254
263
--max-pages <number> Maximum pages to crawl (default: unlimited)
255
264
--crawl-delay <seconds> Crawl delay in seconds
256
265
--exclude <pattern> Exclude URLs matching glob patterns (can be used multiple times)
266
+ --skip-sitemap Skip sitemap.xml and robots.txt discovery
257
267
-v, --verbose Enable verbose logging
258
268
-h, --help Show this help message
259
269
--version Show version number
260
270
261
- Note: Sitemap discovery and robots.txt checking are automatic
271
+ Note: Sitemap discovery and robots.txt checking are automatic unless --skip-sitemap is used.
262
272
263
273
Examples:
264
274
@mdream/crawl -u harlanzw.com --artifacts "llms.txt,markdown"
265
275
@mdream/crawl --url https://docs.example.com --depth 2 --artifacts "llms-full.txt"
266
276
@mdream/crawl -u example.com --exclude "*/admin/*" --exclude "*/api/*"
267
277
@mdream/crawl -u example.com --verbose
278
+ @mdream/crawl -u example.com --skip-sitemap
268
279
` )
269
280
process . exit ( 0 )
270
281
}
@@ -417,6 +428,14 @@ Examples:
417
428
// Check for verbose flag
418
429
const verbose = args . includes ( '--verbose' ) || args . includes ( '-v' )
419
430
431
+ // Check for skip-sitemap flag
432
+ const skipSitemap = args . includes ( '--skip-sitemap' )
433
+
434
+ // Warn if using skip-sitemap with wildcard URLs
435
+ if ( skipSitemap && parsed . isGlob ) {
436
+ p . log . warn ( 'Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.' )
437
+ }
438
+
420
439
return {
421
440
urls : [ url ] ,
422
441
outputDir : resolve ( getArgValue ( '--output' ) || getArgValue ( '-o' ) || 'output' ) ,
@@ -434,6 +453,7 @@ Examples:
434
453
crawlDelay : crawlDelayStr ? Number . parseInt ( crawlDelayStr ) : undefined ,
435
454
exclude : excludePatterns . length > 0 ? excludePatterns : undefined ,
436
455
verbose,
456
+ skipSitemap,
437
457
}
438
458
}
439
459
@@ -465,6 +485,7 @@ async function main() {
465
485
`Depth: ${ options . maxDepth } ` ,
466
486
`Formats: ${ formats . join ( ', ' ) } ` ,
467
487
options . exclude && options . exclude . length > 0 && `Exclude: ${ options . exclude . join ( ', ' ) } ` ,
488
+ options . skipSitemap && `Skip sitemap: Yes` ,
468
489
options . verbose && `Verbose: Enabled` ,
469
490
] . filter ( Boolean )
470
491
0 commit comments