Skip to content

Commit 7c0a591

Browse files
committed
Add saveStorage option to workflow
1 parent d41f60d commit 7c0a591

File tree

7 files changed

+32
-2
lines changed

7 files changed

+32
-2
lines changed

backend/btrixcloud/models.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,8 @@ class RawCrawlConfig(BaseModel):
357357
selectLinks: List[str] = ["a[href]->href"]
358358
clickSelector: str = "a"
359359

360+
saveStorage: Optional[bool] = False
361+
360362

361363
# ============================================================================
362364
class CrawlConfigIn(BaseModel):

frontend/docs/docs/user-guide/workflow-setup.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,10 @@ This setting will only be shown if multiple different release channels are avail
256256

257257
Prevents any content from the domains listed in [Steven Black's Unified Hosts file](https://github.com/StevenBlack/hosts) (ads & malware) from being captured by the crawler.
258258

259+
### Save Local and Session Storage
260+
261+
When enabled, instructs the crawler to save each page's browser `localStorage` and `sessionStorage` data in the web archive as part of the `WARC-JSON-Metadata` field. This option may be necessary to properly archive and replay certain websites. Use caution when sharing WACZ files created with this option enabled, as the saved browser storage may contain sensitive information.
262+
259263
### User Agent
260264

261265
Sets the browser's [user agent](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent) in outgoing requests to the specified value. If left blank, the crawler will use the Brave browser's default user agent. For a list of common user agents see [useragents.me](https://www.useragents.me/).

frontend/src/components/ui/config-details.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,10 @@ export class ConfigDetails extends BtrixElement {
262262
msg("Block Ads by Domain"),
263263
seedsConfig?.blockAds,
264264
)}
265+
${this.renderSetting(
266+
msg("Save Local and Session Storage"),
267+
seedsConfig?.saveStorage,
268+
)}
265269
${this.renderSetting(
266270
msg("User Agent"),
267271
seedsConfig?.userAgent

frontend/src/features/crawl-workflows/workflow-editor.ts

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1655,6 +1655,12 @@ https://archiveweb.page/images/${"logo.svg"}`}
16551655
</sl-checkbox>
16561656
`)}
16571657
${this.renderHelpTextCol(infoTextFor["blockAds"], false)}
1658+
${inputCol(html`
1659+
<sl-checkbox name="saveStorage" ?checked=${this.formState.saveStorage}>
1660+
${msg("Save local and session storage")}
1661+
</sl-checkbox>
1662+
`)}
1663+
${this.renderHelpTextCol(infoTextFor["saveStorage"])}
16581664
${inputCol(html`
16591665
<sl-input
16601666
name="userAgent"
@@ -2631,7 +2637,12 @@ https://archiveweb.page/images/${"logo.svg"}`}
26312637

26322638
private parseUrlListConfig(): Pick<
26332639
CrawlConfigParams["config"],
2634-
"seeds" | "scopeType" | "extraHops" | "useSitemap" | "failOnFailedSeed"
2640+
| "seeds"
2641+
| "scopeType"
2642+
| "extraHops"
2643+
| "useSitemap"
2644+
| "failOnFailedSeed"
2645+
| "saveStorage"
26352646
> {
26362647
const config = {
26372648
seeds: urlListToArray(this.formState.urlList).map((seedUrl) => {
@@ -2642,14 +2653,15 @@ https://archiveweb.page/images/${"logo.svg"}`}
26422653
extraHops: this.formState.includeLinkedPages ? 1 : 0,
26432654
useSitemap: false,
26442655
failOnFailedSeed: this.formState.failOnFailedSeed,
2656+
saveStorage: this.formState.saveStorage,
26452657
};
26462658

26472659
return config;
26482660
}
26492661

26502662
private parseSeededConfig(): Pick<
26512663
CrawlConfigParams["config"],
2652-
"seeds" | "scopeType" | "useSitemap" | "failOnFailedSeed"
2664+
"seeds" | "scopeType" | "useSitemap" | "failOnFailedSeed" | "saveStorage"
26532665
> {
26542666
const primarySeedUrl = this.formState.primarySeedUrl;
26552667
const includeUrlList = this.formState.customIncludeUrlList
@@ -2680,6 +2692,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
26802692
scopeType: this.formState.scopeType as ScopeType,
26812693
useSitemap: this.formState.useSitemap,
26822694
failOnFailedSeed: false,
2695+
saveStorage: this.formState.saveStorage,
26832696
};
26842697
return config;
26852698
}

frontend/src/strings/crawl-workflows/infoText.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,9 @@ export const infoTextFor = {
7575
customBehavior: msg(
7676
`Enable custom page actions with behavior scripts. You can specify any publicly accessible URL or public Git repository.`,
7777
),
78+
saveStorage: msg(
79+
`Save the browser's localStorage and sessionStorage in resulting web archive. Use caution sharing WACZs created with this option.`,
80+
),
7881
} as const satisfies Partial<Record<Field, string | TemplateResult>>;
7982

8083
export default infoTextFor;

frontend/src/types/crawler.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ export type SeedConfig = Expand<
4747
selectLinks: string[];
4848
customBehaviors: string[];
4949
clickSelector: string;
50+
saveStorage?: boolean;
5051
}
5152
>;
5253

frontend/src/utils/workflow.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ export type FormState = {
132132
proxyId: string | null;
133133
selectLinks: string[];
134134
clickSelector: string;
135+
saveStorage: WorkflowParams["config"]["saveStorage"];
135136
};
136137

137138
export type FormStateField = keyof FormState;
@@ -189,6 +190,7 @@ export const getDefaultFormState = (): FormState => ({
189190
selectLinks: DEFAULT_SELECT_LINKS,
190191
clickSelector: DEFAULT_AUTOCLICK_SELECTOR,
191192
customBehavior: false,
193+
saveStorage: false,
192194
});
193195

194196
export const mapSeedToUrl = (arr: Seed[]) =>
@@ -346,6 +348,7 @@ export function getInitialFormState(params: {
346348
crawlerChannel:
347349
params.initialWorkflow.crawlerChannel || defaultFormState.crawlerChannel,
348350
proxyId: params.initialWorkflow.proxyId || defaultFormState.proxyId,
351+
saveStorage: params.initialWorkflow.config.saveStorage,
349352
...formState,
350353
};
351354
}

0 commit comments

Comments
 (0)