
Add recursive fetch option, add URL fetch parameter, add save output option, add duplicate entry removal option, fix tests to be compatible with promises #18

Open · wants to merge 4 commits into master
5 changes: 5 additions & 0 deletions .gitignore
@@ -1,2 +1,7 @@
/node_modules/
/coverage/

*.tgz
*.xml
*.txt
*.csv
38 changes: 31 additions & 7 deletions README.md
@@ -3,10 +3,17 @@
[![Build Status](https://api.travis-ci.org/Rowno/sitemap-urls.svg?branch=master)](https://travis-ci.org/Rowno/sitemap-urls)
[![Dependency Status](https://david-dm.org/Rowno/sitemap-urls/status.svg)](https://david-dm.org/Rowno/sitemap-urls)

Extract URLs from an XML sitemap.
Extract URLs recursively from an XML sitemap.

![Sitemap Urls screenshot](screenshot.png)

## Features

- Multiple input sources: file, URL, or piped stdin
- Recursive extraction of nested sitemaps
- Save output to a file
- Duplicate entry removal

## Getting Started

Install the Sitemap Urls command line tool:
@@ -17,7 +24,13 @@ npm install -g sitemap-urls
yarn add -g sitemap-urls
```

Run `sitemap-urls` on a file containing a sitemap:
Run `sitemap-urls` with a sitemap URL:

```bash
sitemap-urls -r https://example.com
```

Files are also supported:

```bash
sitemap-urls sitemap.xml
@@ -37,19 +50,30 @@ curl http://example.com/sitemap.xml | sitemap-urls
Usage: sitemap-urls <path> [<options>]

Path:
Path to a file containing an XML sitemap.
Path to a file containing an XML sitemap, or a sitemap URL.
This parameter is ignored when the sitemap is being piped.

Options:
-h, --help Show this help text.
-v, --version Print sitemap-urls' version.
-r, --recursive Recursively fetch and extract URLs
-o, --output Save the output to a file
-d, --duplicate Remove duplicate entries
-h, --help Show this help text.
-v, --version Print sitemap-urls' version.
```
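
The flags can be combined. A hypothetical invocation (the output filename `urls.txt` is just an example):

```bash
sitemap-urls -r -d -o urls.txt https://example.com/sitemap.xml
```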

### API

#### `.extractUrls(string xml)` -> `array`
```javascript
main({
  isRecursive: boolean,
  filename: string,
  sitemapContent: string,
  isDuplicate: boolean,
  baseURL: string
}) -> array
```

Extracts URLs from a string containing an XML sitemap.
Recursively extracts URLs from a string containing an XML sitemap, or from a sitemap URL.
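
A minimal usage sketch (note that `extractUrls` is no longer exported directly; the sitemap URL below is hypothetical):

```javascript
const { main } = require('sitemap-urls')

// Fetch a sitemap by URL, recurse into nested sitemaps, and drop duplicates
main({
  baseURL: 'https://example.com/sitemap.xml',
  isRecursive: true,
  isDuplicate: true
}).then(urls => console.log(urls))
```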

Example result:

19 changes: 12 additions & 7 deletions lib/__tests__/index.js
@@ -1,9 +1,10 @@
'use strict'
const sitemapUrls = require('..')
const { main } = require('..')

describe('#extractUrls', () => {
test('should extract urls', () => {
const urls = sitemapUrls.extractUrls(`
test('should extract urls', async () => {
const urls = await main({
sitemapContent: `
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
<url>
@@ -15,13 +16,16 @@ describe('#extractUrls', () => {
<priority>1.0</priority>
</url>
</urlset>
`)
`
})

expect(urls).toEqual(['http://example.com/', 'http://example.com/test/'])
})

test('should not include duplicate urls', () => {
const urls = sitemapUrls.extractUrls(`
test('should not include duplicate urls', async () => {
const urls = await main({
isDuplicate: true,
sitemapContent: `
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
<url>
@@ -33,7 +37,8 @@ describe('#extractUrls', () => {
<priority>1.0</priority>
</url>
</urlset>
`)
`
})

expect(urls).toEqual(['http://example.com/'])
})
40 changes: 33 additions & 7 deletions lib/cli.js
@@ -5,22 +5,37 @@ const path = require('path')
const fs = require('fs')
const meow = require('meow')
const stdin = require('get-stdin')
const sitemapUrls = require('..')
const { main, isURL } = require('..')

const cli = meow(
`
Usage: sitemap-urls <path> [<options>]

Path:
Path to a file containing an XML sitemap.
Path to a file containing an XML sitemap OR URL.
This parameter is ignored when the sitemap is being piped.

Options:
-h, --help Show this help text.
-v, --version Print sitemap-urls' version.
-r, --recursive Recursively fetch and extract URLs
-o, --output Save the output to a file
-d, --duplicate Remove duplicate entries
-h, --help Show this help text.
-v, --version Print sitemap-urls' version.
`,
{
flags: {
recursive: {
type: 'boolean',
alias: 'r'
},
output: {
type: 'string',
alias: 'o'
},
duplicate: {
type: 'boolean',
alias: 'd'
},
help: {
type: 'boolean',
alias: 'h'
@@ -33,17 +48,22 @@
}
)

stdin().then(stdinSitemap => {
stdin().then(async stdinSitemap => {
let filepath
let sitemap
let baseURL = ''

// Require stdin or file
if (!stdinSitemap && !cli.input[0]) {
cli.showHelp()
}

if (cli.input[0] && isURL(cli.input[0])) {
baseURL = cli.input[0]
}

// Try reading file if no stdin
if (stdinSitemap) {
if (baseURL || stdinSitemap) {
sitemap = stdinSitemap
} else {
filepath = path.resolve(cli.input[0])
@@ -55,7 +75,13 @@ stdin().then(stdinSitemap => {
sitemap = fs.readFileSync(filepath, { encoding: 'utf8' })
}

const urls = sitemapUrls.extractUrls(sitemap)
const urls = await main({
isRecursive: cli.flags.recursive,
filename: cli.flags.output,
sitemapContent: sitemap,
isDuplicate: cli.flags.duplicate,
baseURL
})

urls.forEach(url => {
console.log(url)
89 changes: 88 additions & 1 deletion lib/index.js
@@ -1,5 +1,18 @@
'use strict'
const cheerio = require('cheerio')
const path = require('path')
const axios = require('axios')
const fs = require('fs')

// Fetch a sitemap over HTTP and return its XML body
async function fetchXML(url) {
const res = await axios(url)
return res.data
}

// True when the URL's path ends in ".xml" (case-insensitive)
function isURLXML(url) {
const ext = path.extname(url)
return ext.toLowerCase() === '.xml'
}

function extractUrls(xml) {
const urls = []
@@ -16,4 +29,78 @@ function extractUrls(xml) {
return urls
}

exports.extractUrls = extractUrls
// Write one URL per line to the given file
function saveOutput(urlsArray, filename) {
const file = fs.createWriteStream(filename)
file.on('error', function(err) {
console.log(`Error: ${err}`)
})
urlsArray.forEach(function(v) {
file.write(v + '\n')
})
file.end()
}

// True when str parses as an absolute URL (new URL() throws otherwise)
function isURL(str) {
try {
return Boolean(new URL(str).href)
} catch {
return false
}
}

async function main({
baseURL,
sitemapContent,
isRecursive,
filename,
isDuplicate
}) {
let output = []
let urls = []
let xml = null

if (baseURL) {
xml = await fetchXML(baseURL)
} else {
xml = sitemapContent
}

urls = extractUrls(xml)

if (isRecursive) {
console.log('Fetching nested sitemaps recursively, please wait...')
// Only follow entries that point at nested XML sitemaps; mapping every
// entry through `isURLXML(url) && fetchXML(url)` would leave `false`
// values in the array and crash extractUrls on non-XML entries
const pendingXML = urls.filter(isURLXML).map(url => fetchXML(url))
const nestedUrlLists = await Promise.all(pendingXML).then(responseArr =>
responseArr.map(xml => extractUrls(xml))
)
// Flatten the nested results onto the top-level URL list
output = nestedUrlLists.reduce((acc, nested) => {
acc.push(...nested)
return acc
}, urls)
} else {
output = urls
}

if (isDuplicate) {
// Drop repeated entries while preserving order
output = output.reduce(function(acc, url) {
if (!acc.includes(url)) {
acc.push(url)
}

return acc
}, [])
}

if (filename) {
saveOutput(output, filename)
console.log(`${output.length} items saved at ${path.resolve(filename)}`)
}

return output
}

module.exports = {
main,
isURL
}