45 | 45 | "metadata": {},
46 | 46 | "outputs": [],
47 | 47 | "source": [
48 |  | - "from IPython.display import Markdown,HTML"
| 48 | + "from IPython.display import Markdown,HTML\n",
| 49 | + "from fastcore.test import *"
49 | 50 | ]
50 | 51 | },
51 | 52 | {
|
236 | 237 | " url = (base+path+fname).strip('/')\n",
237 | 238 | " if fname=='/llms.txt': return url\n",
238 | 239 | " if Path(fname).suffix in('.md', '.txt', '.rst'): return _tryget(url)\n",
239 |  | - " if '.' in fname: return _tryget(url+'.md')\n",
| 240 | + " if '.' in fname: return _tryget(url+'.md') or find_docs(url[:url.rfind('/')])\n",
240 | 241 | " res = _tryget(url+'/llms.txt')\n",
241 | 242 | " if res: return res\n",
242 | 243 | " res = _tryget(url+'/index.md')\n",
|
245 | 246 | " if res: return res\n",
246 | 247 | " res = _tryget(url+'/index-commonmark.md')\n",
247 | 248 | " if res: return res\n",
248 |  | - " return None"
| 249 | + " parsed_url = urlparse(url)\n",
| 250 | + " if parsed_url.path == '/' or not parsed_url.path: return None\n",
| 251 | + " return find_docs(urljoin(url, '..'))"
249 | 252 | ]
250 | 253 | },
251 | 254 | {
|
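The parent-directory fallback added above is the core of this change: when nothing is found at the given path, find_docs now retries one directory higher via urljoin(url, '..') and gives up once the URL path is empty or '/'. Below is a minimal standalone sketch of that walk, not the notebook's code itself: probe stands in for the notebook's _tryget helper (assumed to return the URL on success, None otherwise), and requests is used only to keep the sketch self-contained.

# Sketch of the parent-directory walk that find_docs gains in the diff above.
import requests
from urllib.parse import urlparse, urljoin

def probe(url):
    "Return `url` if it responds with HTTP 200, else None (stand-in for _tryget)."
    try: return url if requests.get(url, timeout=10).status_code == 200 else None
    except requests.RequestException: return None

def walk_up(url):
    "Try llms.txt at `url`, then retry one directory higher until the site root."
    res = probe(url.rstrip('/') + '/llms.txt')
    if res: return res
    path = urlparse(url).path
    if not path or path == '/': return None    # already at the site root: give up
    return walk_up(urljoin(url, '..'))         # e.g. .../AnswerDotAI -> https://github.com/

The version in the diff recurses the same way, but only after the index.md-style probes at the current level have all failed.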
289 | 292 | "name": "stdout",
290 | 293 | "output_type": "stream",
291 | 294 | "text": [
292 |  | - "None\n",
| 295 | + "https://claudette.answer.ai/index.html.md\n",
293 | 296 | "https://claudette.answer.ai/index.html.md\n",
294 | 297 | "https://claudette.answer.ai/index.html.md\n",
295 | 298 | "https://llmstxt.org/llms.txt\n",
|
301 | 304 | "for o in urls: print(find_docs(o))"
302 | 305 | ]
303 | 306 | },
| 307 | + {
| 308 | + "cell_type": "code",
| 309 | + "execution_count": null,
| 310 | + "id": "439546d4",
| 311 | + "metadata": {},
| 312 | + "outputs": [],
| 313 | + "source": [
| 314 | + "suffixes = [\"/\", \"/tmp\", \"/tmp/\", \"/tmp/tmp\", \"/tmp/tmp/\"]\n",
| 315 | + "for suff in suffixes:\n",
| 316 | + " for o in urls: test_eq(find_docs(o), find_docs(o+suff))"
| 317 | + ]
| 318 | + },
| 319 | + {
| 320 | + "cell_type": "code",
| 321 | + "execution_count": null,
| 322 | + "id": "07d1b763",
| 323 | + "metadata": {},
| 324 | + "outputs": [],
| 325 | + "source": [
| 326 | + "test_eq(find_docs(\"https://github.com\"), \"https://github.com/llms.txt\")\n",
| 327 | + "test_eq(find_docs(\"https://github.com/AnswerDotAI\"), \"https://github.com/llms.txt\")\n",
| 328 | + "test_eq(find_docs(\"https://github.com/AnswerDotAI/\"), \"https://github.com/llms.txt\")"
| 329 | + ]
| 330 | + },
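The GitHub assertions above are the case that exercises the new walk end to end: nothing resolves under /AnswerDotAI, so the lookup has to climb to the site root. A rough trace for that case, inferred from the find_docs body above rather than from captured output:

# find_docs("https://github.com/AnswerDotAI")
#   _tryget("https://github.com/AnswerDotAI/llms.txt")      -> None
#   _tryget("https://github.com/AnswerDotAI/index.md"), ... -> None
#   path "/AnswerDotAI" is not the root, so recurse:
#   find_docs(urljoin(url, '..')) == find_docs("https://github.com/")
#     _tryget("https://github.com/llms.txt")                -> found
# => "https://github.com/llms.txt"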
304 | 331 | {
305 | 332 | "cell_type": "code",
306 | 333 | "execution_count": null,
|
312 | 339 | "def read_docs(url, optional=False, n_workers=None, rm_comments=True, rm_details=True):\n",
313 | 340 | " \"If available, return LLM-friendly llms.txt context or markdown file response for `url`\"\n",
314 | 341 | " url = find_docs(url)\n",
315 |  | - " if not url: return\n",
316 | 342 | " if url.endswith('/llms.txt'): res = get_llmstxt(url, optional=optional, n_workers=n_workers)\n",
317 | 343 | " else: res = get(url).text\n",
318 | 344 | " return clean_md(res, rm_comments=rm_comments, rm_details=rm_details)"
|
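With the "if not url: return" guard removed, read_docs relies on find_docs resolving a URL, which the parent-directory walk now does for any page under a documented site. A small usage sketch against one of the URLs exercised earlier in the notebook:

# Usage sketch: find_docs resolves claudette.answer.ai to
# https://claudette.answer.ai/index.html.md (see the printed output above), so
# read_docs fetches that page and returns it after clean_md has applied the
# rm_comments/rm_details defaults.
md = read_docs('https://claudette.answer.ai')
print(md[:500])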
|