diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 9e9f49a..1b82b26 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -1,11 +1,11 @@ +name: Build + on: push: branches: - main pull_request: -name: Build - jobs: ci: name: CI @@ -17,7 +17,7 @@ jobs: steps: - name: Check out - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up ${{ matrix.toolchain }} Rust uses: dtolnay/rust-toolchain@master @@ -25,17 +25,7 @@ jobs: toolchain: ${{ matrix.toolchain }} - name: Set up Cache - uses: actions/cache@v3 - with: - path: | - ~/.cargo/bin/ - ~/.cargo/registry/index/ - ~/.cargo/registry/cache/ - ~/.cargo/git/db/ - ~/.cargo/.crates.toml - ~/.cargo/.crates2.json - key: ${{ runner.os }}-${{ matrix.toolchain }}-cargo-${{ hashFiles('**/Cargo.lock') }} - restore-keys: ${{ runner.os }}-${{ matrix.toolchain }}-cargo- + uses: Swatinem/rust-cache@v2 - name: Install Tarpaulin if: matrix.os == 'ubuntu-latest' diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 3062f4e..08d25dd 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -1,11 +1,8 @@ +name: Publish + on: - push: - branches: - - main workflow_dispatch: -name: Publish - jobs: cd: name: CD diff --git a/.gitignore b/.gitignore index 1103880..b4e5423 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # OS Thumbs.db .DS_Store +*.pdb # Editors .vs/ @@ -11,12 +12,9 @@ Thumbs.db # Lang: Rust debug/ target/ +Cargo.lock **/*.rs.bk -# Output -dist/ -build/ - # Environment env/ .env diff --git a/Cargo.lock b/Cargo.lock deleted file mode 100644 index 8d28a44..0000000 --- a/Cargo.lock +++ /dev/null @@ -1,625 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. -version = 3 - -[[package]] -name = "addr2line" -version = "0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" -dependencies = [ - "gimli", -] - -[[package]] -name = "adler" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" - -[[package]] -name = "aho-corasick" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" -dependencies = [ - "memchr", -] - -[[package]] -name = "async-trait" -version = "0.1.77" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "autocfg" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" - -[[package]] -name = "backtrace" -version = "0.3.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" -dependencies = [ - "addr2line", - "cc", - "cfg-if", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", -] - -[[package]] -name = "bstr" -version = "1.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05efc5cfd9110c8416e471df0e96702d58690178e206e61b7173706673c93706" -dependencies = [ - "memchr", - "regex-automata", - "serde", -] - -[[package]] -name = "bytes" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" - -[[package]] -name = "cc" -version = "1.0.90" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" - -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] -name = "countio" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "deb137baf1ce7c00453689055b2a4311ed34d288cb2e66e892e3142e30dae67f" -dependencies = [ - "tokio", -] - -[[package]] -name = "countio" -version = "0.2.17" -dependencies = [ - "futures-io", - "futures-test", - "futures-util", - "tokio", -] - -[[package]] -name = "deranged" -version = "0.3.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" -dependencies = [ - "powerfmt", -] - -[[package]] -name = "form_urlencoded" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" -dependencies = [ - "percent-encoding", -] - -[[package]] -name = "futures-core" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" - -[[package]] -name = "futures-executor" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-io" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" - -[[package]] -name = "futures-macro" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "futures-sink" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" - -[[package]] -name = "futures-task" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" - -[[package]] -name = "futures-test" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce388237b32ac42eca0df1ba55ed3bbda4eaf005d7d4b5dbc0b20ab962928ac9" -dependencies = [ - "futures-core", - "futures-executor", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "futures-util", - "pin-project", - "pin-utils", -] - -[[package]] -name = "futures-util" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" -dependencies = [ - "futures-core", - "futures-io", - "futures-task", - "memchr", - "pin-project-lite", - "pin-utils", - "slab", -] - -[[package]] -name = "gimli" -version = "0.28.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" - -[[package]] -name = "idna" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" -dependencies = [ - "unicode-bidi", - "unicode-normalization", -] - -[[package]] -name = "isolang" -version = "2.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe50d48c77760c55188549098b9a7f6e37ae980c586a24693d6b01c3b2010c3c" -dependencies = [ - "phf", -] - -[[package]] -name = "itoa" -version = "1.0.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" - -[[package]] -name = "libc" -version = "0.2.153" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" - -[[package]] -name = "memchr" -version = "2.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" - -[[package]] -name = "minimal-lexical" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" - -[[package]] -name = "miniz_oxide" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" -dependencies = [ - "adler", -] - -[[package]] -name = "nom" -version = "7.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" -dependencies = [ - "memchr", - "minimal-lexical", -] - -[[package]] -name = "num-conv" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" - -[[package]] -name = "object" -version = "0.32.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" -dependencies = [ - "memchr", -] - -[[package]] -name = "percent-encoding" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" - -[[package]] -name = "phf" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" -dependencies = [ - "phf_shared", -] - -[[package]] -name = "phf_shared" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" -dependencies = [ - "siphasher", -] - -[[package]] -name = "pin-project" -version = "1.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "pin-project-lite" -version = "0.2.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = "powerfmt" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" - -[[package]] -name = "proc-macro2" -version = "1.0.78" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "quick-xml" -version = "0.31.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" -dependencies = [ - "memchr", - "tokio", -] - -[[package]] -name = "quote" -version = "1.0.35" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "regex" -version = "1.10.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" -dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" - -[[package]] -name = "robotxt" -version = "0.6.1" -dependencies = [ - "bstr", - "nom", - "percent-encoding", - "regex", - "serde", - "serde_json", - "thiserror", - "url", -] - -[[package]] -name = "rustc-demangle" -version = "0.1.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" - -[[package]] -name = "ryu" -version = "1.0.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" - -[[package]] -name = "serde" -version = "1.0.197" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.197" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "serde_json" -version = "1.0.114" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f09b1bd632ef549eaa9f60a1f8de742bdbc698e6cee2095fc84dde5f549ae0" -dependencies = [ - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "siphasher" -version = "0.3.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" - -[[package]] -name = "sitemapo" -version = "0.2.0" -dependencies = [ - "async-trait", - "bytes", - "countio 0.2.15", - "isolang", - "quick-xml", - "thiserror", - "time", - "tokio", - "url", -] - -[[package]] -name = "slab" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" -dependencies = [ - "autocfg", -] - -[[package]] -name = "syn" -version = "2.0.52" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "thiserror" -version = "1.0.57" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" -dependencies = [ - "thiserror-impl", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.57" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "time" -version = "0.3.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749" -dependencies = [ - "deranged", - "itoa", - "num-conv", - "powerfmt", - "serde", - "time-core", - "time-macros", -] - -[[package]] -name = "time-core" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" - -[[package]] -name = "time-macros" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ba3a3ef41e6672a2f0f001392bb5dcd3ff0a9992d618ca761a11c3121547774" -dependencies = [ - "num-conv", - "time-core", -] - -[[package]] -name = "tinyvec" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" -dependencies = [ - "tinyvec_macros", -] - -[[package]] -name = "tinyvec_macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" - -[[package]] -name = "tokio" -version = "1.36.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" -dependencies = [ - "backtrace", - "bytes", - "pin-project-lite", - "tokio-macros", -] - -[[package]] -name = "tokio-macros" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "unicode-bidi" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" - -[[package]] -name = "unicode-ident" -version = "1.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" - -[[package]] -name = "unicode-normalization" -version = "0.1.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" -dependencies = [ - "tinyvec", -] - -[[package]] -name = "url" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" -dependencies = [ - "form_urlencoded", - "idna", - "percent-encoding", - "serde", -] diff --git a/Cargo.toml b/Cargo.toml index 388742f..12eebc3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,8 +4,8 @@ resolver = "2" members = [ "./countio", - "./exclusion", - "./inclusion", + "./robotxt", + "./sitemapo", ] [workspace.package] @@ -14,15 +14,15 @@ authors = ["Oleh Martsokha "] license = "MIT" [workspace.dependencies] -tokio = { version = "1.36.0", default-features = false } -futures-io = { version = "0.3.30", default-features = false } -futures-util = { version = "0.3.30", default-features = false } -futures-test = { version = "0.3.30", default-features = false } +tokio = { version = "1", default-features = false } +futures-io = { version = "0.3", default-features = false } +futures-util = { version = "0.3", default-features = false } +futures-test = { version = "0.3", default-features = false } -url = { version = "2.5.0" } -async-trait = { version = "0.1.77" } -thiserror = { version = "1.0.57" } +url = { version = "2.5" } +async-trait = { version = "0.1" } +thiserror = { version = "1.0" } -serde = { version = "1.0.197" } -serde_json = { version = "1.0.114" } -time = { version = "0.3.34", default-features = false } +serde = { version = "1.0" } +serde_json = { version = "1.0" } +time = { version = "0.3", default-features = false } diff --git a/LICENSE.txt b/LICENSE.txt index 5817ff0..e481219 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2023 Ethical Web Scraping +Copyright (c) 2023 spire-rs Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 312c66b..f0aa6ac 100644 --- a/README.md +++ b/README.md @@ -10,12 +10,11 @@ #### Crates: -- [countio]('./countio/): The wrapper struct to enable byte counting for - std::io::Read and std::io::Write and its asynchronous variants from futures - and tokio. -- [robotxt]('./exclusion/): The implementation of the Robots.txt (or URL - exclusion) protocol with the support of crawl-delay, sitemap and universal - match extensions. -- [sitemapo]('./inclusion/): The implementation of the Sitemap (or URL - inclusion) protocol with the support of txt, xml formats and video, image, and - news extensions. +- [countio](./countio/): The wrapper struct to enable byte counting for + `std::io::{Read, Write, Seek}` and its async variants from `futures` and + `tokio`. +- [robotxt](./robotxt/): The implementation of the Robots.txt (or URL exclusion) + protocol with the support of `crawl-delay`, `sitemap` and universal `*` match + extensions. +- [sitemapo](./sitemapo/): The implementation of the Sitemap (or URL inclusion) + protocol with the support of txt and xml formats. diff --git a/countio/Cargo.toml b/countio/Cargo.toml index 741db32..7bf21ef 100644 --- a/countio/Cargo.toml +++ b/countio/Cargo.toml @@ -2,15 +2,15 @@ [package] name = "countio" -version = "0.2.17" +version = "0.2.18" readme = "./README.md" edition = { workspace = true } license = { workspace = true } authors = { workspace = true } -repository = "https://github.com/spire-rs/kit/countio" -homepage = "https://github.com/spire-rs/kit/countio" +repository = "https://github.com/spire-rs/kit" +homepage = "https://github.com/spire-rs/kit" documentation = "https://docs.rs/countio" categories = ["parsing", "asynchronous"] keywords = ["byte", "tokio", "futures", "parsing"] @@ -35,10 +35,10 @@ tokio = ["dep:tokio"] futures = ["dep:futures-io"] [dependencies] -tokio = { workspace = true, optional = true } -futures-io = { workspace = true, optional = true, features = ["std"] } +tokio = { version = "1", default-features = false, optional = true } +futures-io = { version = "0.3", default-features = false, optional = true, features = ["std"] } [dev-dependencies] -tokio = { workspace = true, features = ["rt", "macros", "io-util"] } -futures-util = { workspace = true } -futures-test = { workspace = true, features = ["std"] } +tokio = { version = "1", features = ["rt", "macros", "io-util"] } +futures-util = { version = "0.3", default-features = false } +futures-test = { version = "0.3", default-features = false, features = ["std"] } diff --git a/countio/README.md b/countio/README.md index 7a75754..7b56a96 100644 --- a/countio/README.md +++ b/countio/README.md @@ -5,8 +5,7 @@ [![Crate Version][crates-badge]][crates-url] [![Crate Coverage][coverage-badge]][coverage-url] -**Also check out other `spire-rs` projects -[here](https://github.com/spire-rs).** +**Check out other `spire` projects [here](https://github.com/spire-rs).** [action-badge]: https://img.shields.io/github/actions/workflow/status/spire-rs/kit/build.yaml?branch=main&label=build&logo=github&style=flat-square [action-url]: https://github.com/spire-rs/kit/actions/workflows/build.yaml @@ -17,9 +16,8 @@ [coverage-badge]: https://img.shields.io/codecov/c/github/spire-rs/kit?logo=codecov&logoColor=white&style=flat-square [coverage-url]: https://app.codecov.io/gh/spire-rs/kit -The wrapper struct to enable byte counting for `std::io::Read`, -`std::io::Write`, `std::io::Seek` and its asynchronous variants from `futures` -and `tokio` crates. +The wrapper struct to enable byte counting for `std::io::{Read, Write, Seek}` +and its asynchronous variants from `futures` and `tokio` crates. ### Features diff --git a/countio/counter/futures.rs b/countio/counter/futures.rs index e22590f..545bd3e 100644 --- a/countio/counter/futures.rs +++ b/countio/counter/futures.rs @@ -17,7 +17,7 @@ impl AsyncRead for Counter { let pin = Pin::new(&mut counter.inner); let poll = pin.poll_read(ctx, buf); if let Poll::Ready(Ok(bytes)) = poll { - counter.reader_bytes += bytes + counter.reader_bytes += bytes; } poll @@ -48,7 +48,7 @@ impl AsyncWrite for Counter { let poll = pin.poll_write(ctx, buf); if let Poll::Ready(Ok(bytes)) = poll { - counter.writer_bytes += bytes + counter.writer_bytes += bytes; } poll diff --git a/countio/counter/mod.rs b/countio/counter/mod.rs index b3cfc0b..04de808 100644 --- a/countio/counter/mod.rs +++ b/countio/counter/mod.rs @@ -13,18 +13,22 @@ mod tokio; /// The `Counter` struct adds byte counting to any reader or writer. pub struct Counter { pub(crate) inner: D, + /// Total bytes read from the `inner` reader. pub(crate) reader_bytes: usize, + /// Total bytes written to the `inner` writer. pub(crate) writer_bytes: usize, } impl Counter { /// Creates a new `Counter` with zero read/written bytes. - pub fn new(inner: D) -> Self { + #[inline] + pub const fn new(inner: D) -> Self { Self::with_bytes(0, 0, inner) } /// Creates a new `Counter` with the specified read/written bytes. - pub fn with_bytes(reader_bytes: usize, writer_bytes: usize, inner: D) -> Self { + #[inline] + pub const fn with_bytes(reader_bytes: usize, writer_bytes: usize, inner: D) -> Self { Self { inner, reader_bytes, @@ -33,37 +37,44 @@ impl Counter { } /// Returns the sum of read and written bytes by the underlying reader/writer. - pub fn total_bytes(&self) -> usize { + #[inline] + pub const fn total_bytes(&self) -> usize { self.reader_bytes + self.writer_bytes } /// Returns the total amount of read bytes by the underlying reader. - pub fn reader_bytes(&self) -> usize { + #[inline] + pub const fn reader_bytes(&self) -> usize { self.reader_bytes } /// Returns the total amount of written bytes by the underlying writer. - pub fn writer_bytes(&self) -> usize { + #[inline] + pub const fn writer_bytes(&self) -> usize { self.writer_bytes } /// Consumes `Counter` returning the underlying reader/writer. + #[inline] pub fn into_inner(self) -> D { self.inner } /// Gets a reference to the underlying reader/writer. + #[inline] pub fn get_ref(&self) -> &D { &self.inner } /// Gets a mutable reference to the underlying reader/writer. + #[inline] pub fn get_mut(&mut self) -> &mut D { &mut self.inner } } impl From for Counter { + #[inline] fn from(inner: D) -> Self { Self::new(inner) } diff --git a/countio/counter/stdlib.rs b/countio/counter/stdlib.rs index dde59fa..edcf7a4 100644 --- a/countio/counter/stdlib.rs +++ b/countio/counter/stdlib.rs @@ -19,7 +19,7 @@ impl BufRead for Counter { fn consume(&mut self, amt: usize) { self.reader_bytes += amt; - self.inner.consume(amt) + self.inner.consume(amt); } } @@ -30,12 +30,14 @@ impl Write for Counter { Ok(bytes) } + #[inline] fn flush(&mut self) -> Result<()> { self.inner.flush() } } impl Seek for Counter { + #[inline] fn seek(&mut self, pos: SeekFrom) -> Result { self.inner.seek(pos) } diff --git a/countio/counter/tokio.rs b/countio/counter/tokio.rs index 17e74b5..2b7ddc0 100644 --- a/countio/counter/tokio.rs +++ b/countio/counter/tokio.rs @@ -20,8 +20,8 @@ impl AsyncRead for Counter { let poll = pin.poll_read(ctx, buf); let bytes = buf.filled().len() - bytes; - if let Poll::Ready(Ok(())) = poll { - counter.reader_bytes += bytes + if matches!(poll, Poll::Ready(Ok(()))) { + counter.reader_bytes += bytes; } poll @@ -53,7 +53,7 @@ impl AsyncWrite for Counter { let poll = pin.poll_write(ctx, buf); if let Poll::Ready(Ok(bytes)) = poll { - counter.writer_bytes += bytes + counter.writer_bytes += bytes; } poll diff --git a/countio/lib.rs b/countio/lib.rs index a15f8ef..0f66732 100644 --- a/countio/lib.rs +++ b/countio/lib.rs @@ -8,5 +8,5 @@ mod counter; #[doc(hidden)] pub mod prelude { - pub use super::counter::Counter; + pub use super::Counter; } diff --git a/inclusion/build/auto.rs b/inclusion/build/auto.rs deleted file mode 100644 index c85dce1..0000000 --- a/inclusion/build/auto.rs +++ /dev/null @@ -1,101 +0,0 @@ -use url::Url; - -use crate::build::{EntryBuilder, IndexBuilder}; -use crate::record::Entry; -use crate::Error; - -/// TODO: Desc. -/// -/// Automatic sitemap file constructor. -/// NOTE: Does not deduplicate records. -/// -/// ```rust -/// #[derive(Debug, thiserror::Error)] -/// enum CustomError { -/// // .. -/// #[error("sitemap error: {0}")] -/// Sitemap(#[from] sitemapo::Error), -/// //.. -/// } -/// -/// fn main() -> Result<(), CustomError> { -/// Ok(()) -/// } -/// ``` -pub struct AutoBuilder { - index: Option>, - entry: Vec>, - queue: Vec, - // factory: impl Fn() -> W, -} - -impl AutoBuilder { - /// TODO: Desc. - pub fn new() -> Self { - todo!() - } -} - -impl AutoBuilder -where - W: std::io::Write, -{ - /// TODO: Desc. - pub fn try_sync(&mut self, fetcher: A) -> Result<(), E> - where - E: std::error::Error + From, - A: Fn(Url) -> Result, E>, - { - // if let Some(builder) = self.entry.as_mut() { - // builder.write(record) - // } - - todo!() - } -} - -#[cfg(feature = "tokio")] -#[cfg_attr(docsrs, doc(cfg(feature = "tokio")))] -impl AutoBuilder -where - W: tokio::io::AsyncWrite + Unpin + Send, -{ - /// TODO: Desc. - pub async fn try_async(&mut self) -> Result<(), Error> { - todo!() - } -} - -impl std::fmt::Debug for AutoBuilder { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // TODO: Debug. - f.debug_struct("AutoBuilder").finish() - } -} - -// impl Default for AutoBuilder { -// fn default() -> Self { -// Self { -// entry: None, -// index: None, -// } -// } -// } - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn sync() -> Result<(), Error> { - // TODO: Test. - Ok(()) - } - - #[cfg(feature = "tokio")] - #[tokio::test] - async fn asynk() -> Result<(), Error> { - // TODO: Test. - Ok(()) - } -} diff --git a/exclusion/Cargo.toml b/robotxt/Cargo.toml similarity index 62% rename from exclusion/Cargo.toml rename to robotxt/Cargo.toml index 27b3388..8d2abfe 100644 --- a/exclusion/Cargo.toml +++ b/robotxt/Cargo.toml @@ -2,18 +2,18 @@ [package] name = "robotxt" -version = "0.6.1" +version = "0.6.2" readme = "./README.md" edition = { workspace = true } license = { workspace = true } authors = { workspace = true } -repository = "https://github.com/spire-rs/kit/exclusion" -homepage = "https://github.com/spire-rs/kit/exclusion" +repository = "https://github.com/spire-rs/kit" +homepage = "https://github.com/spire-rs/kit" documentation = "https://docs.rs/robotxt" categories = ["asynchronous", "web-programming"] -keywords = ["crawler", "scraper", "web", "framework"] +keywords = ["robots", "robot", "exclusion", "crawler", "scraper"] description = """ The implementation of the Robots.txt (or URL exclusion) protocol with the support of crawl-delay, sitemap and universal match extensions. @@ -45,14 +45,14 @@ optimal = [] serde = ["dep:serde", "url/serde", "serde/derive", "serde/rc"] [dependencies] -url = { workspace = true } -thiserror = { workspace = true } -percent-encoding = { version = "2.3.1" } +url = { version = "2.5" } +thiserror = { version = "1.0" } +percent-encoding = { version = "2.3" } -nom = { version = "7.1.3", optional = true } -bstr = { version = "1.9.1", optional = true } -regex = { version = "1.10.3", optional = true } -serde = { workspace = true, optional = true } +nom = { version = "7.1", optional = true } +bstr = { version = "1.9", optional = true } +regex = { version = "1.10", optional = true } +serde = { version = "1.0", optional = true } [dev-dependencies] -serde_json = { workspace = true } +serde_json = { version = "1.0" } diff --git a/exclusion/README.md b/robotxt/README.md similarity index 64% rename from exclusion/README.md rename to robotxt/README.md index 290bdf6..88c5722 100644 --- a/exclusion/README.md +++ b/robotxt/README.md @@ -5,8 +5,7 @@ [![Crate Version][crates-badge]][crates-url] [![Crate Coverage][coverage-badge]][coverage-url] -**Also check out other `spire-rs` projects -[here](https://github.com/spire-rs).** +**Check out other `spire` projects [here](https://github.com/spire-rs).** [action-badge]: https://img.shields.io/github/actions/workflow/status/spire-rs/kit/build.yaml?branch=main&label=build&logo=github&style=flat-square [action-url]: https://github.com/spire-rs/kit/actions/workflows/build.yaml @@ -38,48 +37,43 @@ programming language with the support of `crawl-delay`, `sitemap` and universal ```rust use robotxt::Robots; -fn main() { - let txt = r#" - User-Agent: foobot - Disallow: * - Allow: /example/ - Disallow: /example/nope.txt - "#; - - let r = Robots::from_bytes(txt.as_bytes(), "foobot"); - assert!(r.is_relative_allowed("/example/yeah.txt")); - assert!(!r.is_relative_allowed("/example/nope.txt")); - assert!(!r.is_relative_allowed("/invalid/path.txt")); -} +let txt = r#" + User-Agent: foobot + Disallow: * + Allow: /example/ + Disallow: /example/nope.txt +"#; + +let r = Robots::from_bytes(txt.as_bytes(), "foobot"); +assert!(r.is_relative_allowed("/example/yeah.txt")); +assert!(!r.is_relative_allowed("/example/nope.txt")); +assert!(!r.is_relative_allowed("/invalid/path.txt")); ``` - build the new `robots.txt` file in a declarative manner: ```rust -use robotxt::RobotsBuilder; - -fn main() -> Result<(), url::ParseError> { - let txt = RobotsBuilder::default() - .header("Robots.txt: Start") - .group(["foobot"], |u| { - u.crawl_delay(5) - .header("Rules for Foobot: Start") - .allow("/example/yeah.txt") - .disallow("/example/nope.txt") - .footer("Rules for Foobot: End") - }) - .group(["barbot", "nombot"], |u| { - u.crawl_delay(2) - .disallow("/example/yeah.txt") - .disallow("/example/nope.txt") - }) - .sitemap("https://example.com/sitemap_1.xml".try_into()?) - .sitemap("https://example.com/sitemap_1.xml".try_into()?) - .footer("Robots.txt: End"); - - println!("{}", txt.to_string()); - Ok(()) -} +use robotxt::{RobotsBuilder, Result}; + +let txt = RobotsBuilder::default() + .header("Robots.txt: Start") + .group(["foobot"], |u| { + u.crawl_delay(5) + .header("Rules for Foobot: Start") + .allow("/example/yeah.txt") + .disallow("/example/nope.txt") + .footer("Rules for Foobot: End") + }) + .group(["barbot", "nombot"], |u| { + u.crawl_delay(2) + .disallow("/example/yeah.txt") + .disallow("/example/nope.txt") + }) + .sitemap("https://example.com/sitemap_1.xml".try_into()?) + .sitemap("https://example.com/sitemap_1.xml".try_into()?) + .footer("Robots.txt: End"); + +println!("{}", txt.to_string()); ``` ### Links diff --git a/exclusion/build/group.rs b/robotxt/build/group.rs similarity index 93% rename from exclusion/build/group.rs rename to robotxt/build/group.rs index 57f1e2d..1cdb84a 100644 --- a/exclusion/build/group.rs +++ b/robotxt/build/group.rs @@ -6,7 +6,8 @@ use crate::paths::normalize_path; /// The single formatted `user-agent` group. /// -/// See [crate::RobotsBuilder::group]. +/// See [`crate::RobotsBuilder::group`]. +#[must_use] #[derive(Debug, Default, Clone)] pub struct GroupBuilder { user_agents: HashSet, @@ -84,7 +85,7 @@ impl GroupBuilder { /// u.crawl_delay(5) /// }); /// ``` - pub fn crawl_delay(mut self, delay: u16) -> Self { + pub const fn crawl_delay(mut self, delay: u16) -> Self { self.delay = Some(delay); self } @@ -143,9 +144,10 @@ impl Display for GroupBuilder { // Explicit Allow: * if no Disallows. // Used to interrupt the user-group i.e. // user-agent: a ..no rules.. user-agent: b - match self.rules_disallow.is_empty() { - true => Some("Allow: *".to_string()), - false => None, + if self.rules_disallow.is_empty() { + Some("Allow: *".to_string()) + } else { + None } } else { let rd = self.rules_allow.iter(); @@ -154,7 +156,7 @@ impl Display for GroupBuilder { }; let result = [header, agents, delay, disallows, allows, footer]; - let result = result.iter().filter_map(|u| u.clone()); + let result = result.iter().filter_map(Clone::clone); let result = result.collect::>().join("\n"); write!(f, "{}", result.as_str()) } diff --git a/exclusion/build/mod.rs b/robotxt/build/mod.rs similarity index 96% rename from exclusion/build/mod.rs rename to robotxt/build/mod.rs index e2a3046..c9a8332 100644 --- a/exclusion/build/mod.rs +++ b/robotxt/build/mod.rs @@ -11,6 +11,7 @@ mod split; /// The set of formatted `user-agent` groups that can be written /// in the `robots.txt` compliant format. +#[must_use] #[derive(Debug, Default, Clone)] pub struct RobotsBuilder { groups: Vec, @@ -106,11 +107,11 @@ impl fmt::Display for RobotsBuilder { let header = self.header.as_ref().map(|h| format_comment(h)); let footer = self.footer.as_ref().map(|f| format_comment(f)); - let groups = self.groups.iter().map(|u| u.to_string()); + let groups = self.groups.iter().map(ToString::to_string); let groups = groups.collect::>().join("\n\n"); let result = [header, Some(groups), footer]; - let result = result.iter().filter_map(|u| u.clone()); + let result = result.iter().filter_map(Clone::clone); let result = result.collect::>().join("\n\n"); write!(f, "{}", result.as_str()) } diff --git a/exclusion/build/split.rs b/robotxt/build/split.rs similarity index 92% rename from exclusion/build/split.rs rename to robotxt/build/split.rs index 0bc57ee..d2b5e27 100644 --- a/exclusion/build/split.rs +++ b/robotxt/build/split.rs @@ -1,7 +1,7 @@ /// Splits multiline comments into lines and prefixes them with `#`. pub fn format_comment(txt: &str) -> String { txt.lines() - .map(|txt| txt.trim()) + .map(str::trim) .filter(|txt| !txt.is_empty()) .map(|txt| { if txt.starts_with('#') { diff --git a/exclusion/lib.rs b/robotxt/lib.rs similarity index 73% rename from exclusion/lib.rs rename to robotxt/lib.rs index f73fd97..cf58df9 100644 --- a/exclusion/lib.rs +++ b/robotxt/lib.rs @@ -1,6 +1,7 @@ #![forbid(unsafe_code)] #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("./README.md")] +#![warn(clippy::all, clippy::pedantic, clippy::nursery, clippy::cargo)] // Re-exports pub use url; @@ -22,13 +23,19 @@ pub enum Error { #[error("cannot be a base url")] CannotBeBase, + /// Unable to create the expected path to the `robots.txt` file: + /// does not have a host. + #[error("does not have a host")] + NoHost, + /// Unable to create the expected path to the `robots.txt` file: /// unexpected address scheme, expected `http` or `https`. - #[error("addr scheme: `{scheme}`, expected `http` or `https`")] + #[error("scheme: `{scheme}`, expected `http` or `https`")] WrongScheme { scheme: String }, /// Unable to create the expected path to the `robots.txt` file: /// unexpected parsing error. + // TODO: Remove url::ParseError. #[error("url parsing error: {0}")] Url(#[from] url::ParseError), } @@ -37,7 +44,7 @@ pub enum Error { /// /// [`Result`]: std::result::Result /// [`robotxt`]: crate -pub type Result = std::result::Result; +pub type Result = std::result::Result; mod paths; @@ -52,9 +59,9 @@ mod parse; #[doc(hidden)] pub mod prelude { #[cfg(feature = "builder")] - pub use super::build::*; + pub use super::build::{GroupBuilder, RobotsBuilder}; #[cfg(feature = "parser")] - pub use super::parse::*; - pub use super::paths::*; + pub use super::parse::{AccessResult, Robots, ALL_UAS}; + pub use super::paths::{create_url, BYTE_LIMIT}; pub use super::{Error, Result}; } diff --git a/exclusion/parse/access.rs b/robotxt/parse/access.rs similarity index 87% rename from exclusion/parse/access.rs rename to robotxt/parse/access.rs index d95e999..0fe8683 100644 --- a/exclusion/parse/access.rs +++ b/robotxt/parse/access.rs @@ -1,16 +1,20 @@ +use std::fmt; +use std::ops::Deref; + /// The result of the `robots.txt` retrieval attempt. /// /// See [`Robots::from_access`]. /// Also see 2.3.1. Access Results in the specification. /// /// [`Robots::from_access`]: crate::Robots::from_access -#[derive(Debug)] +#[derive(Debug, Copy, Clone)] pub enum AccessResult<'a> { /// 2.3.1.1. Successful Access /// /// If the crawler successfully downloads the robots.txt file, the /// crawler MUST follow the parseable rules. Successful(&'a [u8]), + /// 2.3.1.2. Redirects /// /// It's possible that a server responds to a robots.txt fetch request @@ -25,6 +29,7 @@ pub enum AccessResult<'a> { /// If there are more than five consecutive redirects, crawlers MAY /// assume that the robots.txt file is unavailable. Redirect, + /// 2.3.1.3. "Unavailable" Status /// /// "Unavailable" means the crawler tries to fetch the robots.txt file @@ -36,6 +41,7 @@ pub enum AccessResult<'a> { /// unavailable to the crawler, then the crawler MAY access any resources /// on the server. Unavailable, + /// 2.3.1.4. "Unreachable" Status /// /// If the robots.txt file is unreachable due to server or network @@ -52,7 +58,8 @@ pub enum AccessResult<'a> { impl AccessResult<'_> { /// Returns the textual representation of a status. - pub fn as_str(&self) -> &'static str { + #[must_use] + pub const fn as_str(&self) -> &'static str { match self { AccessResult::Successful(_) => "Successful", AccessResult::Redirect => "Redirect", @@ -62,8 +69,18 @@ impl AccessResult<'_> { } } -impl std::fmt::Display for AccessResult<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl Deref for AccessResult<'_> { + type Target = str; + + #[inline] + fn deref(&self) -> &Self::Target { + self.as_str() + } +} + +impl fmt::Display for AccessResult<'_> { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.as_str()) } } diff --git a/exclusion/parse/inner.rs b/robotxt/parse/inner.rs similarity index 93% rename from exclusion/parse/inner.rs rename to robotxt/parse/inner.rs index 6bbf2ae..9bf65af 100644 --- a/exclusion/parse/inner.rs +++ b/robotxt/parse/inner.rs @@ -11,16 +11,17 @@ use crate::parse::rule::Rule; use crate::paths::normalize_path; use crate::BYTE_LIMIT; -/// The [`Rules`] enum determines if the [RobotsInner::is_allowed] results +/// The [`Rules`] enum determines if the [`RobotsInner::is_allowed`] results /// from the set of [`Rule`]s or the single provided global rule. #[derive(Debug, Clone, PartialEq, Eq)] -pub(crate) enum Rules { +pub enum Rules { Rules(Vec), Always(bool), } /// The [`RobotsInner`] struct provides convenient and efficient storage for -/// the data associated with certain user-agent for further matching. +/// the data associated with a specific `user-agent` for further matching. +#[must_use] #[derive(Debug, Clone, PartialEq, Eq)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct RobotsInner { @@ -63,11 +64,11 @@ impl RobotsInner { // TODO: Remove overlapping rules. #[cfg(feature = "optimal")] - if rules.is_empty() || rules.iter().all(|r| r.is_allowed()) { + if rules.is_empty() || rules.iter().all(Rule::is_allowed) { // Empty or all allow. return Rules::Always(true); } else if rules.iter().all(|r| !r.is_allowed()) - && rules.iter().rev().any(|r| r.is_universal()) + && rules.iter().rev().any(Rule::is_universal) { // All disallow + universal disallow. // Universal rule should be one of the smallest, so reverse the iter. @@ -89,6 +90,7 @@ impl RobotsInner { /// Returns `Some(true)` if there is an explicit `allow` or the global rule. /// NOTE: Expects relative path. + #[must_use] pub fn try_is_allowed(&self, path: &str) -> Option { match self.rules { Rules::Always(always) => Some(always), @@ -97,20 +99,22 @@ impl RobotsInner { path => rules .iter() .find(|r| r.is_match(path)) - .map(|rule| rule.is_allowed()), + .map(Rule::is_allowed), }, } } /// Returns true if the relative path is allowed for this set of rules. /// NOTE: Expects relative path. + #[must_use] pub fn is_allowed(&self, path: &str) -> bool { // Returns true is there is no rule matching the path. self.try_is_allowed(path).unwrap_or(true) } /// Returns `Some(_)` if the rules fully allow or disallow. - pub fn is_always(&self) -> Option { + #[must_use] + pub const fn is_always(&self) -> Option { match &self.rules { Rules::Rules(_) => None, Rules::Always(always) => Some(*always), @@ -118,22 +122,26 @@ impl RobotsInner { } /// Returns the longest matching user-agent. + #[must_use] pub fn user_agent(&self) -> &str { self.user_agent.as_ref() } /// Returns the specified crawl-delay. - pub fn crawl_delay(&self) -> Option { + #[must_use] + pub const fn crawl_delay(&self) -> Option { self.crawl_delay } /// Returns all collected sitemaps. + #[must_use] pub fn sitemaps(&self) -> &[Url] { self.sitemaps.as_slice() } /// Returns the total amount of applied rules unless constructed /// with (or optimized to) the global rule. + #[must_use] pub fn len(&self) -> Option { match &self.rules { Rules::Rules(vec) => Some(vec.len()), @@ -143,6 +151,7 @@ impl RobotsInner { /// Returns true if there are no applied rules i.e. it is constructed /// with (or optimized to) the global rule. + #[must_use] pub fn is_empty(&self) -> Option { self.len().map(|len| len == 0) } @@ -151,9 +160,10 @@ impl RobotsInner { #[cfg(test)] #[cfg(feature = "optimal")] mod optimal_output { - use super::*; use crate::ALL_UAS; + use super::*; + #[test] fn from() { let r = RobotsInner::from_always(true, None, "foo"); @@ -192,9 +202,10 @@ mod optimal_output { #[cfg(test)] mod precedence_rules { - use super::*; use crate::ALL_UAS; + use super::*; + #[test] fn simple() { let t = b"Allow: /p \n Disallow: /"; diff --git a/exclusion/parse/lexer.rs b/robotxt/parse/lexer.rs similarity index 98% rename from exclusion/parse/lexer.rs rename to robotxt/parse/lexer.rs index 7cf8e03..f8393e8 100644 --- a/exclusion/parse/lexer.rs +++ b/robotxt/parse/lexer.rs @@ -42,13 +42,13 @@ const NEWLINE: u8 = b'\n'; const COMMENT: u8 = b'#'; /// Returns true if the character code is neither a newline nor a carriage return. -fn not_line_ending(c: u8) -> bool { +const fn not_line_ending(c: u8) -> bool { c != NEWLINE && c != CARRIAGE } /// Returns true if the character code is neither a newline, a carriage return, /// nor a comment character. -fn not_line_ending_or_comment(c: u8) -> bool { +const fn not_line_ending_or_comment(c: u8) -> bool { c != NEWLINE && c != CARRIAGE && c != COMMENT } diff --git a/exclusion/parse/mod.rs b/robotxt/parse/mod.rs similarity index 95% rename from exclusion/parse/mod.rs rename to robotxt/parse/mod.rs index 1124186..013339d 100644 --- a/exclusion/parse/mod.rs +++ b/robotxt/parse/mod.rs @@ -1,20 +1,22 @@ -use std::io::{BufReader, Read}; +use std::io::{self, BufReader, Read}; use std::sync::Arc; +use std::time::Duration; +#[cfg(feature = "serde")] +use ::serde::{Deserialize, Serialize}; use url::Url; -use crate::BYTE_LIMIT; pub use access::AccessResult; use inner::RobotsInner; +use crate::BYTE_LIMIT; + mod access; mod inner; mod lexer; mod parser; mod rule; -#[cfg(feature = "serde")] -use ::serde::{Deserialize, Serialize}; #[cfg(feature = "serde")] mod serde; @@ -62,6 +64,7 @@ pub const ALL_UAS: &str = "*"; /// assert!(!r.is_relative_allowed("/example/nope.txt")); /// assert!(!r.is_relative_allowed("/invalid/path.txt")); /// ``` +#[must_use] #[derive(Debug, Clone, PartialEq, Eq)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct Robots { @@ -87,6 +90,7 @@ impl Robots { /// assert!(!r.is_relative_allowed("/example/nope.txt")); /// assert!(!r.is_relative_allowed("/invalid/path.txt")); /// ``` + pub fn from_bytes(robots: &[u8], user_agent: &str) -> Self { let inner = RobotsInner::from_bytes(robots, user_agent); Self { @@ -113,7 +117,7 @@ impl Robots { /// assert!(!r.is_relative_allowed("/example/nope.txt")); /// assert!(!r.is_relative_allowed("/invalid/path.txt")); /// ``` - pub fn from_reader(reader: R, user_agent: &str) -> Result { + pub fn from_reader(reader: R, user_agent: &str) -> io::Result { let reader = reader.take(BYTE_LIMIT as u64); let mut reader = BufReader::new(reader); @@ -170,6 +174,7 @@ impl Robots { /// See [`RobotsBuilder::new`]. /// /// [`RobotsBuilder::new`]: crate::RobotsBuilder::new + #[inline] #[cfg(feature = "builder")] #[cfg_attr(docsrs, doc(cfg(feature = "builder")))] pub fn builder() -> crate::RobotsBuilder { @@ -195,6 +200,8 @@ impl Robots { /// assert_eq!(r.try_is_relative_allowed("/example/nope.txt"), Some(false)); /// assert_eq!(r.try_is_relative_allowed("/invalid/path.txt"), None); /// ``` + #[inline] + #[must_use] pub fn try_is_relative_allowed(&self, addr: &str) -> Option { self.inner.try_is_allowed(addr) } @@ -217,6 +224,8 @@ impl Robots { /// assert!(!r.is_relative_allowed("/example/nope.txt")); /// assert!(!r.is_relative_allowed("/invalid/path.txt")); /// ``` + #[inline] + #[must_use] pub fn is_relative_allowed(&self, addr: &str) -> bool { self.inner.is_allowed(addr) } @@ -240,18 +249,16 @@ impl Robots { /// assert_eq!(r.try_is_absolute_allowed(&base.join("/example/nope.txt").unwrap()), Some(false)); /// assert_eq!(r.try_is_absolute_allowed(&base.join("/invalid/path.txt").unwrap()), None); /// ``` + #[must_use] pub fn try_is_absolute_allowed(&self, addr: &Url) -> Option { let path = addr.path().to_owned(); - let query = addr - .query() - .map(|u| "?".to_owned() + u) - .unwrap_or("".to_owned()); + let query = addr.query().map(|u| "?".to_owned() + u).unwrap_or_default(); let frag = addr .fragment() .map(|u| "#".to_owned() + u) - .unwrap_or("".to_owned()); + .unwrap_or_default(); let relative = path + &query + &frag; self.inner.try_is_allowed(&relative) @@ -277,6 +284,7 @@ impl Robots { /// assert!(!r.is_absolute_allowed(&base.join("/example/nope.txt").unwrap())); /// assert!(!r.is_absolute_allowed(&base.join("/invalid/path.txt").unwrap())); /// ``` + #[must_use] pub fn is_absolute_allowed(&self, addr: &Url) -> bool { self.try_is_absolute_allowed(addr).unwrap_or(true) } @@ -292,6 +300,8 @@ impl Robots { /// let r = Robots::from_always(false, "foobot"); /// assert_eq!(r.is_always(), Some(false)); /// ``` + #[inline] + #[must_use] pub fn is_always(&self) -> Option { self.inner.is_always() } @@ -310,6 +320,8 @@ impl Robots { /// let r = Robots::from_bytes(txt, "foobot-search"); /// assert_eq!(r.user_agent(), "foobot"); /// ``` + #[inline] + #[must_use] pub fn user_agent(&self) -> &str { self.inner.user_agent() } @@ -328,7 +340,9 @@ impl Robots { /// let r = Robots::from_bytes(txt, "foobot"); /// assert_eq!(r.crawl_delay(), Some(Duration::from_secs(5))); /// ``` - pub fn crawl_delay(&self) -> Option { + #[inline] + #[must_use] + pub fn crawl_delay(&self) -> Option { self.inner.crawl_delay() } @@ -345,18 +359,24 @@ impl Robots { /// let r = Robots::from_bytes(txt, "foobot"); /// assert_eq!(r.sitemaps().len(), 2); /// ``` + #[inline] + #[must_use] pub fn sitemaps(&self) -> &[Url] { self.inner.sitemaps() } /// Returns the total amount of applied rules unless constructed /// with (or optimized to) the global rule. + #[inline] + #[must_use] pub fn len(&self) -> Option { self.inner.len() } /// Returns true if there are no applied rules i.e. it is constructed /// with (or optimized to) the global rule. + #[inline] + #[must_use] pub fn is_empty(&self) -> Option { self.inner.is_empty() } diff --git a/exclusion/parse/parser.rs b/robotxt/parse/parser.rs similarity index 98% rename from exclusion/parse/parser.rs rename to robotxt/parse/parser.rs index 4e18e2b..d8e56ab 100644 --- a/exclusion/parse/parser.rs +++ b/robotxt/parse/parser.rs @@ -22,8 +22,8 @@ impl Parser { pub fn parse_rules(directives: &[Directive], user_agent: &str) -> Self { let (longest_match, captures_rules) = Self::longest_match(directives, user_agent); let mut state = Self { - longest_match, captures_rules, + longest_match, ..Self::default() }; @@ -60,7 +60,7 @@ impl Parser { // Finds the longest `User-Agent` in the acceptable pool. let selected_ua = acceptable_uas .max_by(|lhs, rhs| lhs.len().cmp(&rhs.len())) - .unwrap_or(ALL_UAS.to_string()); + .unwrap_or_else(|| ALL_UAS.to_string()); // Determines if it should check non-assigned rules. let check_non_assigned = selected_ua == ALL_UAS; diff --git a/exclusion/parse/rule.rs b/robotxt/parse/rule.rs similarity index 98% rename from exclusion/parse/rule.rs rename to robotxt/parse/rule.rs index 176ecb1..1432a8f 100644 --- a/exclusion/parse/rule.rs +++ b/robotxt/parse/rule.rs @@ -176,14 +176,14 @@ impl Rule { /// Returns true if the path matches the pattern. /// NOTE: Expects normalized relative path. pub fn is_match(&self, path: &str) -> bool { - match &self.wildcard { - None => path.starts_with(self.pattern.as_str()), - Some(wildcard) => wildcard.is_match(path), - } + self.wildcard.as_ref().map_or_else( + || path.starts_with(self.pattern.as_str()), + |wildcard| wildcard.is_match(path), + ) } /// Returns true if allowed. - pub fn is_allowed(&self) -> bool { + pub const fn is_allowed(&self) -> bool { self.allow } diff --git a/exclusion/parse/serde.rs b/robotxt/parse/serde.rs similarity index 96% rename from exclusion/parse/serde.rs rename to robotxt/parse/serde.rs index 65adf3a..25734a9 100644 --- a/exclusion/parse/serde.rs +++ b/robotxt/parse/serde.rs @@ -1,17 +1,18 @@ use serde::de::{Error, MapAccess, Visitor}; use serde::ser::SerializeStruct; +use serde::{Deserialize, Serialize}; use serde::{Deserializer, Serializer}; use crate::parse::inner::Rules; use crate::parse::rule::Rule; -impl serde::Serialize for Rules { +impl Serialize for Rules { fn serialize(&self, serializer: S) -> Result where S: Serializer, { match self { - Rules::Rules(rules) => { + Self::Rules(rules) => { let (allow, disallow): (Vec<_>, Vec<_>) = rules.iter().partition(|u| u.is_allowed()); let allow: Vec<_> = allow.iter().map(|u| u.pattern().to_string()).collect(); @@ -22,7 +23,7 @@ impl serde::Serialize for Rules { s.serialize_field("disallow", &disallow)?; s.end() } - Rules::Always(always) => { + Self::Always(always) => { let mut s = serializer.serialize_struct("AlwaysRules", 1)?; s.serialize_field("always", always)?; s.end() @@ -31,7 +32,7 @@ impl serde::Serialize for Rules { } } -impl<'de> serde::Deserialize<'de> for Rules { +impl<'de> Deserialize<'de> for Rules { fn deserialize(deserializer: D) -> Result where D: Deserializer<'de>, diff --git a/exclusion/paths/create.rs b/robotxt/paths/create.rs similarity index 65% rename from exclusion/paths/create.rs rename to robotxt/paths/create.rs index e7be5e9..600a686 100644 --- a/exclusion/paths/create.rs +++ b/robotxt/paths/create.rs @@ -1,7 +1,16 @@ +use url::Url; + use crate::{Error, Result}; /// Returns the expected path to the `robots.txt` file -/// as the [`url::Url`]. +/// as the `url::`[`Url`]. +/// +/// # Errors +/// +/// Returns the error if the provided [`Url`] cannot be a base, +/// does not have a host or the schema is not `http` or `https`. +/// +/// # Examples /// /// ```rust /// use url::Url; @@ -12,13 +21,17 @@ use crate::{Error, Result}; /// let robots = create_url(&path).unwrap().to_string(); /// assert_eq!(robots, "https://example.com/robots.txt") /// ``` -pub fn create_url(path: &url::Url) -> Result { +pub fn create_url(path: &Url) -> Result { let mut path = path.clone(); if path.cannot_be_a_base() { return Err(Error::CannotBeBase); } + if path.host().is_none() { + return Err(Error::NoHost); + } + if path.scheme() != "http" && path.scheme() != "https" { return Err(Error::WrongScheme { scheme: path.scheme().to_string(), @@ -26,11 +39,12 @@ pub fn create_url(path: &url::Url) -> Result { } if !path.username().is_empty() { - path.set_username("").unwrap(); + path.set_username("").expect("should pass base/host tests"); } if path.password().is_some() { - path.set_password(None).unwrap(); + path.set_password(None) + .expect("should pass base/host tests"); } path.join("/robots.txt").map_err(Into::into) @@ -38,12 +52,12 @@ pub fn create_url(path: &url::Url) -> Result { #[cfg(test)] mod test { - use super::*; + use crate::{create_url, url::Url, Result}; #[test] fn from_url() -> Result<()> { let path = "https://user:pass@example.com/foo/sample.txt"; - let path = url::Url::parse(path).unwrap(); + let path = Url::parse(path).unwrap(); let robots = create_url(&path)?.to_string(); assert_eq!(robots, "https://example.com/robots.txt"); diff --git a/exclusion/paths/mod.rs b/robotxt/paths/mod.rs similarity index 88% rename from exclusion/paths/mod.rs rename to robotxt/paths/mod.rs index 1dbe15b..6f48b94 100644 --- a/exclusion/paths/mod.rs +++ b/robotxt/paths/mod.rs @@ -1,5 +1,5 @@ pub use create::create_url; -pub(crate) use normal::normalize_path; +pub use normal::normalize_path; mod create; mod normal; diff --git a/exclusion/paths/normal.rs b/robotxt/paths/normal.rs similarity index 87% rename from exclusion/paths/normal.rs rename to robotxt/paths/normal.rs index 2a30950..c4ab12d 100644 --- a/exclusion/paths/normal.rs +++ b/robotxt/paths/normal.rs @@ -4,7 +4,7 @@ use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS}; /// Returns the prefixed & percent-encoded path. /// NOTE: Expects relative path. -pub(crate) fn normalize_path(path: &str) -> String { +pub fn normalize_path(path: &str) -> String { static FRAGMENT: OnceLock = OnceLock::new(); let fragment = FRAGMENT.get_or_init(|| CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>')); let path = utf8_percent_encode(path, fragment).to_string(); @@ -12,9 +12,9 @@ pub(crate) fn normalize_path(path: &str) -> String { // Url::make_relative strips leading and trailing / // https://github.com/servo/rust-url/issues/772 // https://github.com/servo/rust-url/issues/766 - if !path.starts_with('/') { - '/'.to_string() + &path - } else { + if path.starts_with('/') { path + } else { + '/'.to_string() + &path } } diff --git a/inclusion/Cargo.toml b/sitemapo/Cargo.toml similarity index 71% rename from inclusion/Cargo.toml rename to sitemapo/Cargo.toml index ce883f8..44683c0 100644 --- a/inclusion/Cargo.toml +++ b/sitemapo/Cargo.toml @@ -2,21 +2,21 @@ [package] name = "sitemapo" -version = "0.2.0" +version = "0.3.0" readme = "./README.md" edition = { workspace = true } license = { workspace = true } authors = { workspace = true } -repository = "https://github.com/spire-rs/kit/inclusion" -homepage = "https://github.com/spire-rs/kit/inclusion" +repository = "https://github.com/spire-rs/kit" +homepage = "https://github.com/spire-rs/kit" documentation = "https://docs.rs/sitemapo" categories = ["parser-implementations", "web-programming"] -keywords = ["sitemap", "crawler", "parser"] +keywords = ["sitemaps", "sitemap", "inclusion", "crawler", "scraper"] description = """ -The implementation of the Sitemap.xml (or URL inclusion) protocol with -the support of txt & xml formats, and video, image, news extensions. +The implementation of the Sitemap.xml (or URL inclusion) protocol +with the support of txt & xml formats. """ [package.metadata.docs.rs] @@ -42,15 +42,15 @@ extension = ["dep:isolang"] [dependencies] url = { workspace = true } thiserror = { workspace = true } -countio = { version = "0.2.15" } +countio = { version = "0.2" } -quick-xml = { version = "0.31.0" } -bytes = { version = "1.5.0", features = [] } +quick-xml = { version = "0.32" } +bytes = { version = "1.6", features = [] } time = { workspace = true, features = ["parsing", "formatting"] } tokio = { workspace = true, optional = true } async-trait = { workspace = true, optional = true } -isolang = { version = "2.4.0", optional = true, features = [] } +isolang = { version = "2.4", optional = true, features = [] } [dev-dependencies] time = { workspace = true, features = ["macros"] } diff --git a/inclusion/README.md b/sitemapo/README.md similarity index 92% rename from inclusion/README.md rename to sitemapo/README.md index 643ae00..04d61a5 100644 --- a/inclusion/README.md +++ b/sitemapo/README.md @@ -5,8 +5,7 @@ [![Crate Version][crates-badge]][crates-url] [![Crate Coverage][coverage-badge]][coverage-url] -**Also check out other `spire-rs` projects -[here](https://github.com/spire-rs).** +**Check out other `spire` projects [here](https://github.com/spire-rs).** [action-badge]: https://img.shields.io/github/actions/workflow/status/spire-rs/kit/build.yaml?branch=main&label=build&logo=github&style=flat-square [action-url]: https://github.com/spire-rs/kit/actions/workflows/build.yaml @@ -18,8 +17,8 @@ [coverage-url]: https://app.codecov.io/gh/spire-rs/kit The implementation of the Sitemap (or URL inclusion) protocol in the Rust -programming language with the support of `txt` & `xml` formats, and `video`, -`image`, `news` extensions (according to the Google's spec). +programming language with the support of `txt` & `xml` formats (according to the +Google's spec). ### Features diff --git a/inclusion/build/entry.rs b/sitemapo/build/entry.rs similarity index 97% rename from inclusion/build/entry.rs rename to sitemapo/build/entry.rs index 46508a5..13e3fc1 100644 --- a/inclusion/build/entry.rs +++ b/sitemapo/build/entry.rs @@ -4,7 +4,10 @@ use quick_xml::{events, Writer}; use time::format_description::well_known::Iso8601; use crate::build::{Builder, InnerBuilder, CONFIG}; -use crate::record::*; +use crate::record::{ + Entry, BYTE_LIMIT, CHANGE_FREQUENCY, LAST_MODIFIED, LOCATION, PRIORITY, RECORD_LIMIT, URL, + URL_SET, +}; use crate::{Error, Result}; /// Sitemap builder for the versatile XML file with an optional support of extensions. @@ -51,7 +54,7 @@ impl EntryBuilder { } /// Creates a new instance with the given inner parser. - pub(crate) fn from_inner(inner: InnerBuilder) -> Self { + pub(crate) const fn from_inner(inner: InnerBuilder) -> Self { Self { inner } } diff --git a/inclusion/build/index.rs b/sitemapo/build/index.rs similarity index 96% rename from inclusion/build/index.rs rename to sitemapo/build/index.rs index 10222b1..88ab77d 100644 --- a/inclusion/build/index.rs +++ b/sitemapo/build/index.rs @@ -4,7 +4,9 @@ use quick_xml::{events, Writer}; use time::format_description::well_known::Iso8601; use crate::build::{Builder, InnerBuilder, CONFIG}; -use crate::record::*; +use crate::record::{ + Index, BYTE_LIMIT, LAST_MODIFIED, LOCATION, RECORD_LIMIT, SITEMAP, SITEMAP_INDEX, +}; use crate::{Error, Result}; /// Sitemap index parser for the versatile XML file. @@ -51,7 +53,7 @@ impl IndexBuilder { } /// Creates a new instance with the given inner parser. - pub(crate) fn from_inner(inner: InnerBuilder) -> Self { + pub(crate) const fn from_inner(inner: InnerBuilder) -> Self { Self { inner } } diff --git a/inclusion/build/inner.rs b/sitemapo/build/inner.rs similarity index 97% rename from inclusion/build/inner.rs rename to sitemapo/build/inner.rs index 42dea65..404b591 100644 --- a/inclusion/build/inner.rs +++ b/sitemapo/build/inner.rs @@ -6,7 +6,7 @@ use time::format_description::well_known::iso8601; use crate::Error; -pub(crate) const CONFIG: iso8601::EncodedConfig = iso8601::Config::DEFAULT +pub const CONFIG: iso8601::EncodedConfig = iso8601::Config::DEFAULT .set_time_precision(iso8601::TimePrecision::Second { decimal_digits: NonZeroU8::new(2), }) diff --git a/inclusion/build/mod.rs b/sitemapo/build/mod.rs similarity index 97% rename from inclusion/build/mod.rs rename to sitemapo/build/mod.rs index 9a60e67..7c269bb 100644 --- a/inclusion/build/mod.rs +++ b/sitemapo/build/mod.rs @@ -1,15 +1,13 @@ -mod auto; -mod entry; -mod index; -mod inner; -mod plain; - -pub use auto::*; pub use entry::*; pub use index::*; pub(crate) use inner::*; pub use plain::*; +mod entry; +mod index; +mod inner; +mod plain; + // TODO: Make builders take BufWrite. /// Core trait for the builder implementation. diff --git a/inclusion/build/plain.rs b/sitemapo/build/plain.rs similarity index 99% rename from inclusion/build/plain.rs rename to sitemapo/build/plain.rs index 1b82a75..eb6efd3 100644 --- a/inclusion/build/plain.rs +++ b/sitemapo/build/plain.rs @@ -4,7 +4,7 @@ use countio::Counter; use url::Url; use crate::build::Builder; -use crate::record::*; +use crate::record::{BYTE_LIMIT, RECORD_LIMIT}; use crate::{Error, Result}; /// Sitemap builder for the simple TXT file that contains one URL per line. @@ -144,6 +144,7 @@ mod tokio { #[cfg(test)] mod test { use std::io::BufWriter; + use url::Url; use crate::build::{Builder, PlainBuilder}; diff --git a/inclusion/lib.rs b/sitemapo/lib.rs similarity index 95% rename from inclusion/lib.rs rename to sitemapo/lib.rs index dc7a00b..89a3402 100644 --- a/inclusion/lib.rs +++ b/sitemapo/lib.rs @@ -1,6 +1,10 @@ #![forbid(unsafe_code)] #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("./README.md")] +#![warn(clippy::all, clippy::pedantic, clippy::nursery, clippy::cargo)] + +// Re-exports +pub use url; /// Unrecoverable failure during `sitemap.xml` building or parsing. /// @@ -38,9 +42,6 @@ pub enum Error { /// [`sitemapo`]: crate pub type Result = std::result::Result; -// Re-exports -pub use url; - /// Builder types: `AutoBuilder`, `TxtBuilder` & `XmlBuilder`. pub mod build; /// Parser types: `AutoParser`, `TxtParser` & `XmlParser`. @@ -50,9 +51,8 @@ pub mod record; #[doc(hidden)] pub mod prelude { - pub use super::{Error, Result}; - pub use super::build::*; pub use super::parse::*; pub use super::record::*; + pub use super::{Error, Result}; } diff --git a/inclusion/parse/auto.rs b/sitemapo/parse/auto.rs similarity index 96% rename from inclusion/parse/auto.rs rename to sitemapo/parse/auto.rs index 3e3f9a2..e602fff 100644 --- a/inclusion/parse/auto.rs +++ b/sitemapo/parse/auto.rs @@ -3,7 +3,11 @@ use countio::Counter; use quick_xml::{events, Reader}; use url::Url; -use crate::{parse::*, record::*, Error}; +use crate::{ + parse::{try_if_readable, EntryParser, IndexParser, InnerParser, Parser, PlainParser}, + record::{Entry, SITEMAP_INDEX, URL_SET}, + Error, +}; /// Sitemap type resolver. // TODO: Check for the plain txt sitemaps. @@ -147,9 +151,9 @@ impl AutoParser { /// Returns minimal (no resolved indexes) total sitemaps amount. pub fn len(&self) -> usize { self.sitemaps.len() - + self.plain.is_some() as usize - + self.index.is_some() as usize - + self.entry.is_some() as usize + + usize::from(self.plain.is_some()) + + usize::from(self.index.is_some()) + + usize::from(self.entry.is_some()) } } @@ -199,7 +203,7 @@ where if let Some(sitemap) = self.sitemaps.pop() { let reader = (fetcher)(sitemap)?; if let Ok(sitemap) = Scanner::from_sync(reader) { - self.replace_parser(sitemap) + self.replace_parser(sitemap); } } diff --git a/inclusion/parse/entry.rs b/sitemapo/parse/entry.rs similarity index 95% rename from inclusion/parse/entry.rs rename to sitemapo/parse/entry.rs index 35ebdf3..249a3e6 100644 --- a/inclusion/parse/entry.rs +++ b/sitemapo/parse/entry.rs @@ -8,7 +8,7 @@ use crate::{Error, Result}; /// [`Entry`] builder. #[derive(Debug, Clone, Default)] -pub(crate) struct EntryFactory { +pub struct EntryFactory { location: Option, modified: Option, priority: Option, @@ -78,7 +78,7 @@ impl EntryParser { } /// Creates a new instance with the given inner parser. - pub(crate) fn from_inner(inner: InnerParser) -> Self { + pub(crate) const fn from_inner(inner: InnerParser) -> Self { Self { inner } } @@ -114,17 +114,17 @@ impl EntryParser { } } - pub(crate) fn write_event(&mut self, event: events::Event) -> Result> { + pub(crate) fn write_event(&mut self, event: events::Event) -> Output { let tag = URL.as_bytes(); let builder = self.inner.write_event(event, tag, Self::apply_inner); if let Ok(Output::Some(r)) = builder { if let Some(record) = r.build() { - return Ok(Output::Some(record)); + return Output::Some(record); } } - Ok(Output::None) + Output::None } } @@ -149,7 +149,7 @@ impl Parser for EntryParser { loop { self.inner.try_if_readable()?; let event = self.inner.reader.read_event_into(&mut buf)?; - match self.write_event(event)? { + match self.write_event(event) { Output::Some(record) => return Ok(Some(record)), Output::None => {} Output::End => return Ok(None), @@ -185,7 +185,7 @@ mod async_parser { loop { self.inner.try_if_readable()?; let event = self.inner.reader.read_event_into_async(&mut buf).await?; - match self.write_event(event)? { + match self.write_event(event) { Output::Some(record) => return Ok(Some(record)), Output::None => {} Output::End => return Ok(None), diff --git a/inclusion/parse/index.rs b/sitemapo/parse/index.rs similarity index 93% rename from inclusion/parse/index.rs rename to sitemapo/parse/index.rs index eb05574..77bf4d4 100644 --- a/inclusion/parse/index.rs +++ b/sitemapo/parse/index.rs @@ -8,7 +8,7 @@ use crate::{Error, Result}; /// [`Index`] builder. #[derive(Debug, Clone, Default)] -pub(crate) struct IndexFactory { +pub struct IndexFactory { pub(crate) location: Option, pub(crate) modified: Option, } @@ -53,7 +53,7 @@ impl IndexParser { } /// Creates a new instance with the given inner parser. - pub(crate) fn from_inner(inner: InnerParser) -> Self { + pub(crate) const fn from_inner(inner: InnerParser) -> Self { Self { inner } } @@ -85,17 +85,17 @@ impl IndexParser { } } - pub(crate) fn write_event(&mut self, event: events::Event) -> Result> { + pub(crate) fn write_event(&mut self, event: events::Event) -> Output { let tag = SITEMAP.as_bytes(); let builder = self.inner.write_event(event, tag, Self::apply_inner); if let Ok(Output::Some(r)) = builder { if let Some(record) = r.build() { - return Ok(Output::Some(record)); + return Output::Some(record); } } - Ok(Output::None) + Output::None } } @@ -120,7 +120,7 @@ impl Parser for IndexParser { loop { self.inner.try_if_readable()?; let event = self.inner.reader.read_event_into(&mut buf)?; - match self.write_event(event)? { + match self.write_event(event) { Output::Some(record) => return Ok(Some(record)), Output::None => {} Output::End => return Ok(None), @@ -156,7 +156,7 @@ mod tokio { loop { self.inner.try_if_readable()?; let event = self.inner.reader.read_event_into_async(&mut buf).await?; - match self.write_event(event)? { + match self.write_event(event) { Output::Some(record) => return Ok(Some(record)), Output::None => {} Output::End => return Ok(None), diff --git a/inclusion/parse/inner.rs b/sitemapo/parse/inner.rs similarity index 95% rename from inclusion/parse/inner.rs rename to sitemapo/parse/inner.rs index dd0c5d0..851a59f 100644 --- a/inclusion/parse/inner.rs +++ b/sitemapo/parse/inner.rs @@ -5,7 +5,7 @@ use quick_xml::{events::Event, Reader}; use crate::parse::try_if_readable; use crate::Result; -pub(crate) enum Output { +pub enum Output { /// Next record. Some(T), /// The event didn't result into the new record. @@ -16,11 +16,11 @@ pub(crate) enum Output { impl From> for Output { fn from(value: Option) -> Self { - value.map(Output::Some).unwrap_or(Output::End) + value.map_or(Self::End, Self::Some) } } -pub(crate) struct InnerParser { +pub struct InnerParser { pub(crate) record: Option, pub(crate) reader: Reader>, pub(crate) records: usize, @@ -113,6 +113,6 @@ impl std::fmt::Debug for InnerParser { f.debug_struct("InnerParser") .field("bytes", &self.reader.get_ref().reader_bytes()) .field("records", &self.records) - .finish() + .finish_non_exhaustive() } } diff --git a/inclusion/parse/mod.rs b/sitemapo/parse/mod.rs similarity index 94% rename from inclusion/parse/mod.rs rename to sitemapo/parse/mod.rs index 72057b3..250e030 100644 --- a/inclusion/parse/mod.rs +++ b/sitemapo/parse/mod.rs @@ -1,15 +1,15 @@ -mod auto; -mod entry; -mod index; -mod inner; -mod plain; - pub use auto::*; pub use entry::*; pub use index::*; pub(crate) use inner::*; pub use plain::*; +mod auto; +mod entry; +mod index; +mod inner; +mod plain; + /// Core trait for the parser implementation. pub trait Parser: Sized { type Error: std::error::Error; @@ -41,7 +41,7 @@ pub trait AsyncParser: Sized { async fn close(self) -> Result; } -pub(crate) fn try_if_readable(records: usize, bytes: usize) -> crate::Result<()> { +pub(crate) const fn try_if_readable(records: usize, bytes: usize) -> crate::Result<()> { use crate::record::{BYTE_LIMIT, RECORD_LIMIT}; if records + 1 > RECORD_LIMIT { diff --git a/inclusion/parse/plain.rs b/sitemapo/parse/plain.rs similarity index 100% rename from inclusion/parse/plain.rs rename to sitemapo/parse/plain.rs diff --git a/inclusion/record/entry.rs b/sitemapo/record/entry.rs similarity index 81% rename from inclusion/record/entry.rs rename to sitemapo/record/entry.rs index 0fb48d4..0ae6d4d 100644 --- a/inclusion/record/entry.rs +++ b/sitemapo/record/entry.rs @@ -15,6 +15,7 @@ use crate::record::{Frequency, Priority}; /// .with_priority(Priority::MAX) /// .with_frequency(Frequency::Daily); /// ``` +#[must_use] #[derive(Debug, Clone)] pub struct Entry { pub location: Url, @@ -25,7 +26,7 @@ pub struct Entry { impl Entry { /// Creates a new instance with the given location. - pub fn new(location: Url) -> Self { + pub const fn new(location: Url) -> Self { Self { location, modified: None, @@ -35,19 +36,19 @@ impl Entry { } /// Creates a new record with the given modify timestamp. - pub fn with_modified(mut self, modified: OffsetDateTime) -> Self { + pub const fn with_modified(mut self, modified: OffsetDateTime) -> Self { self.modified = Some(modified); self } /// Creates a new record with the given priority. - pub fn with_priority(mut self, priority: Priority) -> Self { + pub const fn with_priority(mut self, priority: Priority) -> Self { self.priority = Some(priority); self } /// Creates a new record with the given change frequency. - pub fn with_frequency(mut self, frequency: Frequency) -> Self { + pub const fn with_frequency(mut self, frequency: Frequency) -> Self { self.frequency = Some(frequency); self } @@ -55,6 +56,6 @@ impl Entry { impl From for Entry { fn from(location: Url) -> Self { - Entry::new(location) + Self::new(location) } } diff --git a/inclusion/record/frequency.rs b/sitemapo/record/frequency.rs similarity index 78% rename from inclusion/record/frequency.rs rename to sitemapo/record/frequency.rs index 649a56c..924ead2 100644 --- a/inclusion/record/frequency.rs +++ b/sitemapo/record/frequency.rs @@ -12,6 +12,7 @@ pub struct FrequencyError; /// /// This value provides general information to search engines and /// may not correlate exactly to how often they crawl the page. +#[must_use] #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Frequency { /// Describes documents that change each time they are accessed. @@ -40,17 +41,14 @@ impl Frequency { /// assert_eq!(frequency.unwrap(), Frequency::Daily); /// ``` pub fn parse(frequency: &str) -> Result { - let frequency = frequency.trim().to_lowercase(); - - use Frequency::*; - match frequency.as_str() { - "always" => Ok(Always), - "hourly" => Ok(Hourly), - "daily" => Ok(Daily), - "weekly" => Ok(Weekly), - "monthly" => Ok(Monthly), - "yearly" => Ok(Yearly), - "never" => Ok(Never), + match frequency.trim().to_lowercase().as_str() { + "always" => Ok(Self::Always), + "hourly" => Ok(Self::Hourly), + "daily" => Ok(Self::Daily), + "weekly" => Ok(Self::Weekly), + "monthly" => Ok(Self::Monthly), + "yearly" => Ok(Self::Yearly), + "never" => Ok(Self::Never), _ => Err(FrequencyError), } } @@ -66,15 +64,15 @@ impl Frequency { /// let rs = Frequency::Monthly.next_date(d0); /// assert_eq!(rs.unwrap(), datetime!(2022-10-12 12:00 UTC)) /// ``` + #[must_use] pub fn next_date(&self, date: OffsetDateTime) -> Option { - use Frequency::*; match &self { - Always | Never => None, - Hourly => Some(date + 1.hours()), - Daily => Some(date + 1.days()), - Weekly => Some(date + 7.days()), - Monthly => Some(date + 30.days()), - Yearly => Some(date + 365.days()), + Self::Always | Self::Never => None, + Self::Hourly => Some(date + 1.hours()), + Self::Daily => Some(date + 1.days()), + Self::Weekly => Some(date + 7.days()), + Self::Monthly => Some(date + 30.days()), + Self::Yearly => Some(date + 365.days()), } } @@ -88,14 +86,14 @@ impl Frequency { /// let d1 = datetime!(2022-10-12 12:00 UTC); /// assert!(Frequency::Monthly.is_outdated(d0, d1)); /// ``` + #[must_use] pub fn is_outdated(&self, date: OffsetDateTime, now: OffsetDateTime) -> bool { match &self { Self::Always => true, Self::Never => false, - _ => match self.next_date(date) { - Some(next) => next <= now, - None => unreachable!(), - }, + _ => self + .next_date(date) + .map_or_else(|| unreachable!(), |x| x <= now), } } } diff --git a/inclusion/record/index.rs b/sitemapo/record/index.rs similarity index 77% rename from inclusion/record/index.rs rename to sitemapo/record/index.rs index bbfa216..b54812b 100644 --- a/inclusion/record/index.rs +++ b/sitemapo/record/index.rs @@ -11,6 +11,7 @@ use url::Url; /// let _ = Index::new(Url::parse("https://example.com/").unwrap()) /// .with_modified(datetime!(2020-01-01 0:00 UTC)); /// ``` +#[must_use] #[derive(Debug, Clone)] pub struct Index { pub location: Url, @@ -19,7 +20,7 @@ pub struct Index { impl Index { /// Creates a new record with the given location. - pub fn new(location: Url) -> Self { + pub const fn new(location: Url) -> Self { Self { location, modified: None, @@ -27,16 +28,14 @@ impl Index { } /// Creates a new record with the given modify timestamp. - pub fn with_modified(self, modified: OffsetDateTime) -> Self { - Self { - modified: Some(modified), - ..self - } + pub const fn with_modified(mut self, modified: OffsetDateTime) -> Self { + self.modified = Some(modified); + self } } impl From for Index { fn from(location: Url) -> Self { - Index::new(location) + Self::new(location) } } diff --git a/inclusion/record/mod.rs b/sitemapo/record/mod.rs similarity index 100% rename from inclusion/record/mod.rs rename to sitemapo/record/mod.rs diff --git a/inclusion/record/priority.rs b/sitemapo/record/priority.rs similarity index 95% rename from inclusion/record/priority.rs rename to sitemapo/record/priority.rs index f0aef52..dbbd5eb 100644 --- a/inclusion/record/priority.rs +++ b/sitemapo/record/priority.rs @@ -17,6 +17,7 @@ pub enum PriorityError { /// Valid values range from 0.0 to 1.0. This value does not affect how your /// pages are compared to pages on other sites. It only lets the search engines /// know which pages you deem most important for the crawlers. +#[must_use] #[derive(Debug, Clone, Copy, PartialEq)] pub struct Priority(f32); @@ -46,7 +47,7 @@ impl Priority { /// assert_eq!(frequency.as_inner(), 1.0); /// ``` pub fn new_fallback(priority: f32) -> Self { - Self(priority.max(0.0).min(1.0)) + Self(priority.clamp(0.0, 1.0)) } /// Tries to parse the string into the valid priority value. @@ -63,7 +64,8 @@ impl Priority { } /// Returns the internal value. - pub fn as_inner(&self) -> f32 { + #[must_use] + pub const fn as_inner(&self) -> f32 { self.0 }