Skip to content

Commit ca13789

Browse files
authored
Merge pull request #9 from AnswerDotAI/read
read_docs walks up parent paths if llms.txt not found.
2 parents bc6c377 + 5dbcf65 commit ca13789

File tree

2 files changed

+36
-9
lines changed

2 files changed

+36
-9
lines changed

03_download.ipynb

Lines changed: 31 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,8 @@
4545
"metadata": {},
4646
"outputs": [],
4747
"source": [
48-
"from IPython.display import Markdown,HTML"
48+
"from IPython.display import Markdown,HTML\n",
49+
"from fastcore.test import *"
4950
]
5051
},
5152
{
@@ -236,7 +237,7 @@
236237
" url = (base+path+fname).strip('/')\n",
237238
" if fname=='/llms.txt': return url\n",
238239
" if Path(fname).suffix in('.md', '.txt', '.rst'): return _tryget(url)\n",
239-
" if '.' in fname: return _tryget(url+'.md')\n",
240+
" if '.' in fname: return _tryget(url+'.md') or find_docs(url[:url.rfind('/')])\n",
240241
" res = _tryget(url+'/llms.txt')\n",
241242
" if res: return res\n",
242243
" res = _tryget(url+'/index.md')\n",
@@ -245,7 +246,9 @@
245246
" if res: return res\n",
246247
" res = _tryget(url+'/index-commonmark.md')\n",
247248
" if res: return res\n",
248-
" return None"
249+
" parsed_url = urlparse(url)\n",
250+
" if parsed_url.path == '/' or not parsed_url.path: return None\n",
251+
" return find_docs(urljoin(url, '..'))"
249252
]
250253
},
251254
{
@@ -289,7 +292,7 @@
289292
"name": "stdout",
290293
"output_type": "stream",
291294
"text": [
292-
"None\n",
295+
"https://claudette.answer.ai/index.html.md\n",
293296
"https://claudette.answer.ai/index.html.md\n",
294297
"https://claudette.answer.ai/index.html.md\n",
295298
"https://llmstxt.org/llms.txt\n",
@@ -301,6 +304,30 @@
301304
"for o in urls: print(find_docs(o))"
302305
]
303306
},
307+
{
308+
"cell_type": "code",
309+
"execution_count": null,
310+
"id": "439546d4",
311+
"metadata": {},
312+
"outputs": [],
313+
"source": [
314+
"suffixes = [\"/\", \"/tmp\", \"/tmp/\", \"/tmp/tmp\", \"/tmp/tmp/\"]\n",
315+
"for suff in suffixes:\n",
316+
" for o in urls: test_eq(find_docs(o), find_docs(o+suff))"
317+
]
318+
},
319+
{
320+
"cell_type": "code",
321+
"execution_count": null,
322+
"id": "07d1b763",
323+
"metadata": {},
324+
"outputs": [],
325+
"source": [
326+
"test_eq(find_docs(\"https://github.com\"), \"https://github.com/llms.txt\")\n",
327+
"test_eq(find_docs(\"https://github.com/AnswerDotAI\"), \"https://github.com/llms.txt\")\n",
328+
"test_eq(find_docs(\"https://github.com/AnswerDotAI/\"), \"https://github.com/llms.txt\")"
329+
]
330+
},
304331
{
305332
"cell_type": "code",
306333
"execution_count": null,
@@ -312,7 +339,6 @@
312339
"def read_docs(url, optional=False, n_workers=None, rm_comments=True, rm_details=True):\n",
313340
" \"If available, return LLM-friendly llms.txt context or markdown file response for `url`\"\n",
314341
" url = find_docs(url)\n",
315-
" if not url: return\n",
316342
" if url.endswith('/llms.txt'): res = get_llmstxt(url, optional=optional, n_workers=n_workers)\n",
317343
" else: res = get(url).text\n",
318344
" return clean_md(res, rm_comments=rm_comments, rm_details=rm_details)"

toolslm/download.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def find_docs(url):
7676
url = (base+path+fname).strip('/')
7777
if fname=='/llms.txt': return url
7878
if Path(fname).suffix in('.md', '.txt', '.rst'): return _tryget(url)
79-
if '.' in fname: return _tryget(url+'.md')
79+
if '.' in fname: return _tryget(url+'.md') or find_docs(url[:url.rfind('/')])
8080
res = _tryget(url+'/llms.txt')
8181
if res: return res
8282
res = _tryget(url+'/index.md')
@@ -85,13 +85,14 @@ def find_docs(url):
8585
if res: return res
8686
res = _tryget(url+'/index-commonmark.md')
8787
if res: return res
88-
return None
88+
parsed_url = urlparse(url)
89+
if parsed_url.path == '/' or not parsed_url.path: return None
90+
return find_docs(urljoin(url, '..'))
8991

90-
# %% ../03_download.ipynb 19
92+
# %% ../03_download.ipynb 21
9193
def read_docs(url, optional=False, n_workers=None, rm_comments=True, rm_details=True):
9294
"If available, return LLM-friendly llms.txt context or markdown file response for `url`"
9395
url = find_docs(url)
94-
if not url: return
9596
if url.endswith('/llms.txt'): res = get_llmstxt(url, optional=optional, n_workers=n_workers)
9697
else: res = get(url).text
9798
return clean_md(res, rm_comments=rm_comments, rm_details=rm_details)

0 commit comments

Comments
 (0)