Merge pull request #9 from AnswerDotAI/read

jph00 · web-flow · commit ca13789f3905 · 2024-09-12T23:49:09.000+10:00
read_docs walks up parent paths if llms.txt is not found.
diff --git a/03_download.ipynb b/03_download.ipynb
@@ -45,7 +45,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from IPython.display import Markdown,HTML"
+    "from IPython.display import Markdown,HTML\n",
+    "from fastcore.test import *"
    ]
   },
   {
@@ -236,7 +237,7 @@
     "    url = (base+path+fname).strip('/')\n",
     "    if fname=='/llms.txt': return url\n",
     "    if Path(fname).suffix in('.md', '.txt', '.rst'): return _tryget(url)\n",
-    "    if '.' in fname: return _tryget(url+'.md')\n",
+    "    if '.' in fname: return _tryget(url+'.md') or find_docs(url[:url.rfind('/')])\n",
     "    res = _tryget(url+'/llms.txt')\n",
     "    if res: return res\n",
     "    res = _tryget(url+'/index.md')\n",
@@ -245,7 +246,9 @@
     "    if res: return res\n",
     "    res = _tryget(url+'/index-commonmark.md')\n",
     "    if res: return res\n",
-    "    return None"
+    "    parsed_url = urlparse(url)\n",
+    "    if parsed_url.path == '/' or not parsed_url.path: return None\n",
+    "    return find_docs(urljoin(url, '..'))"
    ]
   },
   {
@@ -289,7 +292,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "None\n",
+      "https://claudette.answer.ai/index.html.md\n",
       "https://claudette.answer.ai/index.html.md\n",
       "https://claudette.answer.ai/index.html.md\n",
       "https://llmstxt.org/llms.txt\n",
@@ -301,6 +304,30 @@
     "for o in urls: print(find_docs(o))"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "439546d4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "suffixes = [\"/\", \"/tmp\", \"/tmp/\", \"/tmp/tmp\", \"/tmp/tmp/\"]\n",
+    "for suff in suffixes:\n",
+    "    for o in urls: test_eq(find_docs(o), find_docs(o+suff))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "07d1b763",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_eq(find_docs(\"https://github.com\"), \"https://github.com/llms.txt\")\n",
+    "test_eq(find_docs(\"https://github.com/AnswerDotAI\"), \"https://github.com/llms.txt\")\n",
+    "test_eq(find_docs(\"https://github.com/AnswerDotAI/\"), \"https://github.com/llms.txt\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -312,7 +339,6 @@
     "def read_docs(url, optional=False, n_workers=None, rm_comments=True, rm_details=True):\n",
     "    \"If available, return LLM-friendly llms.txt context or markdown file response for `url`\"\n",
     "    url = find_docs(url)\n",
-    "    if not url: return\n",
     "    if url.endswith('/llms.txt'): res = get_llmstxt(url, optional=optional, n_workers=n_workers)\n",
     "    else: res = get(url).text\n",
     "    return clean_md(res, rm_comments=rm_comments, rm_details=rm_details)"
diff --git a/toolslm/download.py b/toolslm/download.py
@@ -76,7 +76,7 @@ def find_docs(url):
     url = (base+path+fname).strip('/')
     if fname=='/llms.txt': return url
     if Path(fname).suffix in('.md', '.txt', '.rst'): return _tryget(url)
-    if '.' in fname: return _tryget(url+'.md')
+    if '.' in fname: return _tryget(url+'.md') or find_docs(url[:url.rfind('/')])
     res = _tryget(url+'/llms.txt')
     if res: return res
     res = _tryget(url+'/index.md')
@@ -85,13 +85,14 @@ def find_docs(url):
     if res: return res
     res = _tryget(url+'/index-commonmark.md')
     if res: return res
-    return None
+    parsed_url = urlparse(url)
+    if parsed_url.path == '/' or not parsed_url.path: return None
+    return find_docs(urljoin(url, '..'))
 
-# %% ../03_download.ipynb 19
+# %% ../03_download.ipynb 21
 def read_docs(url, optional=False, n_workers=None, rm_comments=True, rm_details=True):
     "If available, return LLM-friendly llms.txt context or markdown file response for `url`"
     url = find_docs(url)
-    if not url: return
     if url.endswith('/llms.txt'): res = get_llmstxt(url, optional=optional, n_workers=n_workers)
     else: res = get(url).text
     return clean_md(res, rm_comments=rm_comments, rm_details=rm_details)