
Add recursive fetch option, add URL fetch parameter, add save output option, add duplicate entry removal option, fix tests to be compatible with promises #18

Open · wants to merge 4 commits into master
5 changes: 5 additions & 0 deletions .gitignore
@@ -1,2 +1,7 @@
/node_modules/
/coverage/

*.tgz
*.xml
*.txt
*.csv
38 changes: 31 additions & 7 deletions README.md
@@ -3,10 +3,17 @@
[![Build Status](https://api.travis-ci.org/Rowno/sitemap-urls.svg?branch=master)](https://travis-ci.org/Rowno/sitemap-urls)
[![Dependency Status](https://david-dm.org/Rowno/sitemap-urls/status.svg)](https://david-dm.org/Rowno/sitemap-urls)

Extract URLs from an XML sitemap.
Extract URLs recursively from an XML sitemap.

![Sitemap Urls screenshot](screenshot.png)

## Features

- Multiple input sources: file, URL, or piped stdin
- Recursive extraction of nested sitemaps
- Save output to a file
- Duplicate entry removal

## Getting Started

Install the Sitemap Urls command line tool:
@@ -17,7 +24,13 @@ npm install -g sitemap-urls
yarn add -g sitemap-urls
```

Run `sitemap-urls` on a file containing a sitemap:
Run `sitemap-urls` with a sitemap URL:

```bash
sitemap-urls -r https://example.com
```

Files are also supported:

```bash
sitemap-urls sitemap.xml
@@ -37,19 +50,30 @@ curl http://example.com/sitemap.xml | sitemap-urls
Usage: sitemap-urls <path> [<options>]

Path:
Path to a file containing an XML sitemap.
Path to a file containing an XML sitemap, or a sitemap URL.
This parameter is ignored when the sitemap is being piped.

Options:
-h, --help Show this help text.
-v, --version Print sitemap-urls' version.
-r, --recursive Recursively fetch and extract URLs
-o, --output Save the output to a file
-d, --duplicate Remove duplicate entries
-h, --help Show this help text.
-v, --version Print sitemap-urls' version.
```
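
The flags can be combined. A hypothetical invocation (the output filename `urls.txt` is just an example):

```bash
sitemap-urls -r -d -o urls.txt https://example.com/sitemap.xml
```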

### API

#### `.extractUrls(string xml)` -> `array`
```javascript
main({
  isRecursive: boolean,
  filename: string,
  sitemapContent: string,
  isDuplicate: boolean,
  baseURL: string
}) -> array
```

Extracts URLs from a string containing an XML sitemap.
Recursively extracts URLs from a string containing an XML sitemap, or from a sitemap URL.
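
A minimal usage sketch (note that `extractUrls` is no longer exported directly; the sitemap URL below is hypothetical):

```javascript
const { main } = require('sitemap-urls')

// Fetch a sitemap by URL, recurse into nested sitemaps, and drop duplicates
main({
  baseURL: 'https://example.com/sitemap.xml',
  isRecursive: true,
  isDuplicate: true
}).then(urls => console.log(urls))
```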

Example result:

19 changes: 12 additions & 7 deletions lib/__tests__/index.js
@@ -1,9 +1,10 @@
'use strict'
const sitemapUrls = require('..')
const { main } = require('..')

describe('#extractUrls', () => {
test('should extract urls', () => {
const urls = sitemapUrls.extractUrls(`
test('should extract urls', async () => {
const urls = await main({
sitemapContent: `
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
<url>
@@ -15,13 +16,16 @@ describe('#extractUrls', () => {
<priority>1.0</priority>
</url>
</urlset>
`)
`
})

expect(urls).toEqual(['http://example.com/', 'http://example.com/test/'])
})

test('should not include duplicate urls', () => {
const urls = sitemapUrls.extractUrls(`
test('should not include duplicate urls', async () => {
const urls = await main({
isDuplicate: true,
sitemapContent: `
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
<url>
@@ -33,7 +37,8 @@ describe('#extractUrls', () => {
<priority>1.0</priority>
</url>
</urlset>
`)
`
})

expect(urls).toEqual(['http://example.com/'])
})
40 changes: 33 additions & 7 deletions lib/cli.js
@@ -5,22 +5,37 @@ const path = require('path')
const fs = require('fs')
const meow = require('meow')
const stdin = require('get-stdin')
const sitemapUrls = require('..')
const { main, isURL } = require('..')

const cli = meow(
`
Usage: sitemap-urls <path> [<options>]

Path:
Path to a file containing an XML sitemap.
Path to a file containing an XML sitemap OR URL.
This parameter is ignored when the sitemap is being piped.

Options:
-h, --help Show this help text.
-v, --version Print sitemap-urls' version.
-r, --recursive Recursively fetch and extract URLs
-o, --output Save the output to a file
-d, --duplicate Remove duplicate entries
-h, --help Show this help text.
-v, --version Print sitemap-urls' version.
`,
{
flags: {
recursive: {
type: 'boolean',
alias: 'r'
},
output: {
type: 'string',
alias: 'o'
},
duplicate: {
type: 'boolean',
alias: 'd'
},
help: {
type: 'boolean',
alias: 'h'
@@ -33,17 +48,22 @@
}
)

stdin().then(stdinSitemap => {
stdin().then(async stdinSitemap => {
let filepath
let sitemap
let baseURL = ''

// Require stdin or file
if (!stdinSitemap && !cli.input[0]) {
cli.showHelp()
}

if (cli.input[0] && isURL(cli.input[0])) {
baseURL = cli.input[0]
}

// Try reading file if no stdin
if (stdinSitemap) {
if (baseURL || stdinSitemap) {
sitemap = stdinSitemap
} else {
filepath = path.resolve(cli.input[0])
@@ -55,7 +75,13 @@ stdin().then(stdinSitemap => {
sitemap = fs.readFileSync(filepath, { encoding: 'utf8' })
}

const urls = sitemapUrls.extractUrls(sitemap)
const urls = await main({
isRecursive: cli.flags.recursive,
filename: cli.flags.output,
sitemapContent: sitemap,
isDuplicate: cli.flags.duplicate,
baseURL
})

urls.forEach(url => {
console.log(url)
89 changes: 88 additions & 1 deletion lib/index.js
@@ -1,5 +1,18 @@
'use strict'
const cheerio = require('cheerio')
const path = require('path')
const axios = require('axios')
const fs = require('fs')

// Fetch a sitemap over HTTP and return its XML body
async function fetchXML(url) {
const res = await axios(url)
return res.data
}

// True when the URL's path ends in ".xml" (case-insensitive)
function isURLXML(url) {
const ext = path.extname(url)
return ext.toLowerCase() === '.xml'
}

function extractUrls(xml) {
const urls = []
@@ -16,4 +29,78 @@ function extractUrls(xml) {
return urls
}

exports.extractUrls = extractUrls
// Write one URL per line to the given file
function saveOutput(urlsArray, filename) {
const file = fs.createWriteStream(filename)
file.on('error', function(err) {
console.log(`Error: ${err}`)
})
urlsArray.forEach(function(v) {
file.write(v + '\n')
})
file.end()
}

// True when str parses as an absolute URL (new URL() throws otherwise)
function isURL(str) {
try {
return Boolean(new URL(str).href)
} catch {
return false
}
}

async function main({
baseURL,
sitemapContent,
isRecursive,
filename,
isDuplicate
}) {
let output = []
let urls = []
let xml = null

if (baseURL) {
xml = await fetchXML(baseURL)
} else {
xml = sitemapContent
}

urls = extractUrls(xml)

if (isRecursive) {
console.log('Fetching nested sitemaps recursively, please wait...')
// Only follow entries that point at nested XML sitemaps; mapping every
// entry through `isURLXML(url) && fetchXML(url)` would leave `false`
// values in the array and crash extractUrls on non-XML entries
const pendingXML = urls.filter(isURLXML).map(url => fetchXML(url))
const nestedUrlLists = await Promise.all(pendingXML).then(responseArr =>
responseArr.map(xml => extractUrls(xml))
)
// Flatten the nested results onto the top-level URL list
output = nestedUrlLists.reduce((acc, nested) => {
acc.push(...nested)
return acc
}, urls)
} else {
output = urls
}

if (isDuplicate) {
// Drop repeated entries while preserving order
output = output.reduce(function(acc, url) {
if (!acc.includes(url)) {
acc.push(url)
}

return acc
}, [])
}

if (filename) {
saveOutput(output, filename)
console.log(`${output.length} items saved at ${path.resolve(filename)}`)
}

return output
}

module.exports = {
main,
isURL
}