Commit cc3d7ec

define crawler with backend as queue storage
1 parent 658ce38 commit cc3d7ec
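
The commit message describes the split of responsibilities: the crawler itself runs in the page, while URL queue storage lives on a backend reached through two endpoints, GET /crawl-api/next-url and POST /crawl-api/queue-urls (see src/crawler/crawler.ts below). The backend is not part of this commit; purely as a sketch of the contract the client code assumes, an in-memory queue could look like this (all names here are hypothetical):

// Hypothetical in-memory queue matching the contract the crawler expects.
// CrawlQueue, pending, seen, crawled are illustrative names, not part of this commit.
class CrawlQueue {
	private pending: string[] = [];
	private seen = new Set< string >();
	private crawled = 0;

	// Would back POST /crawl-api/queue-urls
	queueUrls( urls: string[] ): {
		accepted: number;
		rejected: number;
		queueSize: number;
		crawledCount: number;
	} {
		let accepted = 0;
		for ( const url of urls ) {
			if ( this.seen.has( url ) ) {
				continue; // already queued or crawled
			}
			this.seen.add( url );
			this.pending.push( url );
			accepted++;
		}
		return {
			accepted,
			rejected: urls.length - accepted,
			queueSize: this.pending.length,
			crawledCount: this.crawled,
		};
	}

	// Would back GET /crawl-api/next-url; null maps to a 204 response
	nextUrl(): string | null {
		const url = this.pending.shift();
		if ( url === undefined ) {
			return null;
		}
		this.crawled++;
		return url;
	}
}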

File tree

.eslintrc
src/crawler/crawler.ts

2 files changed: +275 -1 lines changed

.eslintrc

Lines changed: 11 additions & 1 deletion
@@ -22,5 +22,15 @@
 		"no-console": [
 			"off"
 		]
-	}
+	},
+	"overrides": [
+		{
+			"files": [
+				"src/crawler/**/*.ts"
+			],
+			"rules": {
+				"react/no-is-mounted": "off"
+			}
+		}
+	]
 }

src/crawler/crawler.ts

Lines changed: 264 additions & 0 deletions
@@ -0,0 +1,264 @@
import { CommandTypes, sendCommandToContent } from '@/bus/Command';

interface CrawlerState {
	isActive: boolean;
	nextProcessTime: number;
	rateLimit: number;
}

interface QueueUrlsResponse {
	accepted: number;
	rejected: number;
	queueSize: number;
	crawledCount: number;
}

interface NextUrlResponse {
	url: string;
}

interface QueueUrlsRequest {
	urls: string[];
	sourceUrl: string;
}

class Crawler {
	private readonly state: CrawlerState;
	private process: ( html: string ) => Promise< void >;

	constructor() {
		this.state = {
			isActive: false,
			nextProcessTime: 0,
			rateLimit: 1.0, // pages per second; 1.0 means a 1000 ms delay between requests
		};
		// Initialize with an empty process function
		this.process = async () => {};
	}

	private log(
		level: 'log' | 'warn' | 'error',
		...args: any[]
	): void {
		console[ level ]( ...args );
	}

	// Allow setting the process function
	public setProcessFunction(
		processFn: ( html: string ) => Promise< void >
	): void {
		this.process = processFn;
	}

	public async start(): Promise< void > {
		if ( this.state.isActive ) {
			this.log( 'log', 'Crawler already running' );
			return;
		}

		this.state.isActive = true;
		this.log( 'log', 'Crawler started' );

		while ( this.state.isActive ) {
			const next = await this.getNextUrl();
			if ( next ) {
				await this.processUrl( next );
			} else {
				this.state.isActive = false;
				this.log( 'log', 'Crawler finished' );
			}
		}
	}

	private async processUrl( url: string ): Promise< void > {
		this.log( 'log', 'processing url', url );
		try {
			// Wait until we're allowed to process the next URL
			await this.waitForRateLimit();

			await this.navigateToUrl( url );

			// @TODO: Get the HTML content via bus?
			const html = document.documentElement.outerHTML;

			// Process the page content
			await this.process( html );

			// Extract and queue new URLs
			const links = this.extractLinks( html );
			await this.queueUrls( links, url );
		} catch ( error ) {
			this.log( 'error', 'Error processing URL', url, error );
			this.state.isActive = false;
		}
	}

	private async waitForRateLimit(): Promise< void > {
		const now = Date.now();
		const delayMs = 1000 / this.state.rateLimit; // Convert the rate limit to milliseconds between requests

		if ( now < this.state.nextProcessTime ) {
			await new Promise( ( resolve ) =>
				setTimeout( resolve, this.state.nextProcessTime - now )
			);
		}

		// Schedule the next allowed slot one full delay after the later of
		// "now" and the previously scheduled slot, so consecutive requests
		// stay at least delayMs apart.
		this.state.nextProcessTime =
			Math.max( now, this.state.nextProcessTime ) + delayMs;
	}

	private extractLinks( htmlString: string ): string[] {
		// Create a DOM parser instance
		const parser = new DOMParser();

		// Parse the HTML string into a document
		const doc = parser.parseFromString( htmlString, 'text/html' );

		// Find all anchor tags
		const linkElements = doc.querySelectorAll( 'a' );

		// Convert the NodeList to an array and extract link data
		const links = Array.from( linkElements ).map( ( link ) => {
			// Get the href attribute
			const href = link.getAttribute( 'href' );

			// Skip if there is no href, or it's a javascript: link or an anchor link
			if (
				! href ||
				href.startsWith( 'javascript:' ) ||
				href.startsWith( '#' )
			) {
				return null;
			}

			// Try to resolve relative URLs to absolute
			let absoluteUrl;
			try {
				absoluteUrl = new URL( href, window.location.origin ).href;
			} catch ( e ) {
				// If URL parsing fails, use the original href
				absoluteUrl = href;
			}

			// Skip links that point to another host
			const isExternal = link.hostname !== window.location.hostname;
			if ( isExternal ) {
				return null;
			}

			return absoluteUrl;
		} );

		// Filter out null values and return unique links
		return links
			.filter( ( link ) => link !== null )
			.filter(
				( link, index, self ) =>
					index === self.findIndex( ( l ) => l === link )
			);
	}

	private async queueUrls(
		urls: string[],
		sourceUrl: string,
		retryCount = 0,
		maxRetries = 5
	): Promise< QueueUrlsResponse > {
		const request: QueueUrlsRequest = {
			urls,
			sourceUrl,
		};

		const response = await fetch( '/crawl-api/queue-urls', {
			method: 'POST',
			headers: { 'Content-Type': 'application/json' },
			body: JSON.stringify( request ),
		} );

		if ( ! response.ok ) {
			this.log(
				'warn',
				`Attempt ${
					retryCount + 1
				}/${ maxRetries } failed: HTTP error! status: ${
					response.status
				}`
			);

			if ( retryCount >= maxRetries - 1 ) {
				return Promise.reject(
					new Error(
						`Failed to queue URLs after ${ maxRetries } attempts`
					)
				);
			}

			// Wait before retrying
			await this.sleep();

			// Retry with an incremented attempt counter
			return this.queueUrls( urls, sourceUrl, retryCount + 1, maxRetries );
		}

		return response.json();
	}

	private async sleep( ms: number = 1000 ): Promise< void > {
		return new Promise( ( resolve ) => setTimeout( resolve, ms ) );
	}

	private async getNextUrl(
		retryCount = 0,
		maxRetries = 5
	): Promise< string | null > {
		const response = await fetch( '/crawl-api/next-url' );

		// The crawling queue is finished
		if ( response.status === 204 ) {
			return null;
		}

		if ( ! response.ok ) {
			this.log(
				'warn',
				`Attempt ${
					retryCount + 1
				}/${ maxRetries } failed: HTTP error! status: ${
					response.status
				}`
			);

			if ( retryCount >= maxRetries - 1 ) {
				return Promise.reject(
					new Error(
						`Failed to get next URL after ${ maxRetries } attempts`
					)
				);
			}

			// Wait before retrying
			await this.sleep();

			// Retry with an incremented attempt counter
			return this.getNextUrl( retryCount + 1, maxRetries );
		}

		const data: NextUrlResponse = await response.json();
		return data.url;
	}

	private async navigateToUrl( url: string ): Promise< void > {
		void sendCommandToContent( {
			type: CommandTypes.NavigateTo,
			payload: { url },
		} );
	}

	public stop(): void {
		this.state.isActive = false;
	}

	public updateRateLimit( newLimit: number ): void {
		// Only allow between 0.1 and 10 pages per second; no particular reason for these bounds, feel free to change them
		this.state.rateLimit = Math.max( 0.1, Math.min( 10.0, newLimit ) );
	}
}
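
For context, a minimal way to drive the class might look like the following, assuming Crawler is exported or instantiated in the same module; the processing callback and the rate-limit value are illustrative only:

const crawler = new Crawler();

// Hypothetical processing callback: receives the raw HTML of each visited page
crawler.setProcessFunction( async ( html: string ) => {
	console.log( 'crawled page, HTML length:', html.length );
} );

crawler.updateRateLimit( 2 ); // at most 2 pages per second
void crawler.start(); // loops until /crawl-api/next-url answers 204

// Later, e.g. from a UI action:
// crawler.stop();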
