From fc815e68bf14a838ad6f006bb25c0dfe40da2e91 Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Thu, 8 Dec 2022 23:16:38 +0100 Subject: [PATCH 01/17] Providing status codes (fixes #161) --- src/UsesGoutte.php | 100 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/src/UsesGoutte.php b/src/UsesGoutte.php index 5f9be19..c681948 100644 --- a/src/UsesGoutte.php +++ b/src/UsesGoutte.php @@ -133,4 +133,104 @@ public function clickLink($titleOrUrl): self return $this; } + + public function statusCode(): ?int + { + return $this->client->getResponse()?->getStatusCode(); + } + + public function is2xx(): bool + { + return $this->statusCode() >= 200 && $this->statusCode() <= 299; + } + + public function is3xx(): bool + { + return $this->statusCode() >= 300 && $this->statusCode() <= 399; + } + + public function is4xx(): bool + { + return $this->statusCode() >= 400 && $this->statusCode() <= 499; + } + + public function is5xx(): bool + { + return $this->statusCode() >= 500 && $this->statusCode() <= 599; + } + + public function is200(): bool + { + return $this->statusCode() === 200; + } + + public function is301(): bool + { + return $this->statusCode() === 301; + } + + public function is302(): bool + { + return $this->statusCode() === 302; + } + + public function is400(): bool + { + return $this->statusCode() === 400; + } + + public function is401(): bool + { + return $this->statusCode() === 401; + } + + public function is402(): bool + { + return $this->statusCode() === 402; + } + + public function is403(): bool + { + return $this->statusCode() === 403; + } + + public function is404(): bool + { + return $this->statusCode() === 404; + } + + public function is500(): bool + { + return $this->statusCode() === 500; + } + + public function isOk(): bool + { + return $this->is200(); + } + + public function isUnauthorized(): bool + { + return $this->is401(); + } + + public function isForbidden(): bool + { + return $this->is403(); + } + + public 
function isNotFound(): bool + { + return $this->is404(); + } + + public function isServerError(): bool + { + return $this->is500(); + } + + public function isInternalServerError(): bool + { + return $this->is500(); + } } \ No newline at end of file From 5cb469cbbd7b086d6cad24429882484f1f07b940 Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Thu, 8 Dec 2022 23:21:10 +0100 Subject: [PATCH 02/17] Hardening call --- src/UsesGoutte.php | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/UsesGoutte.php b/src/UsesGoutte.php index c681948..1c5f2f1 100644 --- a/src/UsesGoutte.php +++ b/src/UsesGoutte.php @@ -134,9 +134,13 @@ public function clickLink($titleOrUrl): self return $this; } - public function statusCode(): ?int + public function statusCode(): int { - return $this->client->getResponse()?->getStatusCode(); + if ($this->client->getResponse() === null) { + throw new \Exception('You can not access the status code before your first navigation using `go`.'); + } + + return $this->client->getResponse()->getStatusCode(); } public function is2xx(): bool From c17eeeb371985b51e6822828efae8117e4f2c4b7 Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Thu, 8 Dec 2022 23:29:15 +0100 Subject: [PATCH 03/17] Adding tests --- src/UsesGoutte.php | 2 +- tests/StatusCodeTest.php | 53 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 tests/StatusCodeTest.php diff --git a/src/UsesGoutte.php b/src/UsesGoutte.php index 1c5f2f1..8e94bd5 100644 --- a/src/UsesGoutte.php +++ b/src/UsesGoutte.php @@ -136,7 +136,7 @@ public function clickLink($titleOrUrl): self public function statusCode(): int { - if ($this->client->getResponse() === null) { + if ($this->currentPage === null) { throw new \Exception('You can not access the status code before your first navigation using `go`.'); } diff --git a/tests/StatusCodeTest.php b/tests/StatusCodeTest.php new file mode 100644 index 0000000..3389f29 --- /dev/null +++ 
b/tests/StatusCodeTest.php @@ -0,0 +1,53 @@ +expectException(\Exception::class); + $this->expectExceptionMessage('You can not access the status code before your first navigation using `go`.'); + + $web->statusCode; + } + + /** + * @test + */ + public function testOk() + { + $web = new \Spekulatius\PHPScraper\PHPScraper; + + // Navigate to the test page: This redirects to phpscraper.de + $web->go('https://phpscraper.de'); + + // Check the status itself. + $this->assertSame(200, $web->statusCode); + + // Check the detailed states. + $this->assertTrue($web->is2xx); + $this->assertFalse($web->is4xx); + $this->assertFalse($web->is5xx); + $this->assertTrue($web->is200); + $this->assertFalse($web->is400); + $this->assertFalse($web->is401); + $this->assertFalse($web->is402); + $this->assertFalse($web->is403); + $this->assertFalse($web->is404); + $this->assertFalse($web->is500); + + // Assert access-helpers + $this->assertTrue($web->isOk); + $this->assertFalse($web->isUnauthorized); + $this->assertFalse($web->isForbidden); + $this->assertFalse($web->isNotFound); + $this->assertFalse($web->isServerError); + $this->assertFalse($web->isInternalServerError); + } +} From 5481b55d4673e0d4203cbd3dc1f173f1524c59fa Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Thu, 8 Dec 2022 23:38:51 +0100 Subject: [PATCH 04/17] Adding further tests for status code --- tests/NotFoundTest.php | 22 ---------------------- tests/StatusCodeTest.php | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 22 deletions(-) delete mode 100644 tests/NotFoundTest.php diff --git a/tests/NotFoundTest.php b/tests/NotFoundTest.php deleted file mode 100644 index 1bd1ce1..0000000 --- a/tests/NotFoundTest.php +++ /dev/null @@ -1,22 +0,0 @@ -go('https://test-pages.phpscraper.de/page-does-not-exist.html'); - - // The built-in server returns this string. 
- $this->assertSame('Page Not Found', $web->title); - } -} diff --git a/tests/StatusCodeTest.php b/tests/StatusCodeTest.php index 3389f29..768bdf7 100644 --- a/tests/StatusCodeTest.php +++ b/tests/StatusCodeTest.php @@ -50,4 +50,38 @@ public function testOk() $this->assertFalse($web->isServerError); $this->assertFalse($web->isInternalServerError); } + + /** + * @test + */ + public function testNotFound() + { + $web = new \Spekulatius\PHPScraper\PHPScraper; + + // Navigate to the test page: This redirects to phpscraper.de + $web->go('https://test-pages.phpscraper.de/page-does-not-exist.html'); + + // Check the status itself. + $this->assertSame(404, $web->statusCode); + + // Check the detailed states. + $this->assertFalse($web->is2xx); + $this->assertTrue($web->is4xx); + $this->assertFalse($web->is5xx); + $this->assertFalse($web->is200); + $this->assertFalse($web->is400); + $this->assertFalse($web->is401); + $this->assertFalse($web->is402); + $this->assertFalse($web->is403); + $this->assertTrue($web->is404); + $this->assertFalse($web->is500); + + // Assert access-helpers + $this->assertFalse($web->isOk); + $this->assertFalse($web->isUnauthorized); + $this->assertFalse($web->isForbidden); + $this->assertTrue($web->isNotFound); + $this->assertFalse($web->isServerError); + $this->assertFalse($web->isInternalServerError); + } } From 87d48c63e581086c7264716b2f03acbc91468b09 Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Fri, 9 Dec 2022 12:45:28 +0100 Subject: [PATCH 05/17] Simplifying methods for status codes --- src/UsesGoutte.php | 80 +++------------------------------------- tests/StatusCodeTest.php | 36 ++++-------------- 2 files changed, 12 insertions(+), 104 deletions(-) diff --git a/src/UsesGoutte.php b/src/UsesGoutte.php index 8e94bd5..8d73430 100644 --- a/src/UsesGoutte.php +++ b/src/UsesGoutte.php @@ -143,98 +143,28 @@ public function statusCode(): int return $this->client->getResponse()->getStatusCode(); } - public function is2xx(): bool + public function 
isSuccess(): bool { return $this->statusCode() >= 200 && $this->statusCode() <= 299; } - public function is3xx(): bool - { - return $this->statusCode() >= 300 && $this->statusCode() <= 399; - } - - public function is4xx(): bool + public function isClientError(): bool { return $this->statusCode() >= 400 && $this->statusCode() <= 499; } - public function is5xx(): bool + public function isServerError(): bool { return $this->statusCode() >= 500 && $this->statusCode() <= 599; } - public function is200(): bool - { - return $this->statusCode() === 200; - } - - public function is301(): bool - { - return $this->statusCode() === 301; - } - - public function is302(): bool - { - return $this->statusCode() === 302; - } - - public function is400(): bool - { - return $this->statusCode() === 400; - } - - public function is401(): bool - { - return $this->statusCode() === 401; - } - - public function is402(): bool - { - return $this->statusCode() === 402; - } - - public function is403(): bool - { - return $this->statusCode() === 403; - } - - public function is404(): bool - { - return $this->statusCode() === 404; - } - - public function is500(): bool - { - return $this->statusCode() === 500; - } - - public function isOk(): bool - { - return $this->is200(); - } - - public function isUnauthorized(): bool - { - return $this->is401(); - } - public function isForbidden(): bool { - return $this->is403(); + return $this->statusCode() === 403; } public function isNotFound(): bool { - return $this->is404(); - } - - public function isServerError(): bool - { - return $this->is500(); - } - - public function isInternalServerError(): bool - { - return $this->is500(); + return $this->statusCode() === 404; } } \ No newline at end of file diff --git a/tests/StatusCodeTest.php b/tests/StatusCodeTest.php index 768bdf7..4f09af8 100644 --- a/tests/StatusCodeTest.php +++ b/tests/StatusCodeTest.php @@ -31,24 +31,13 @@ public function testOk() $this->assertSame(200, $web->statusCode); // Check the detailed 
states. - $this->assertTrue($web->is2xx); - $this->assertFalse($web->is4xx); - $this->assertFalse($web->is5xx); - $this->assertTrue($web->is200); - $this->assertFalse($web->is400); - $this->assertFalse($web->is401); - $this->assertFalse($web->is402); - $this->assertFalse($web->is403); - $this->assertFalse($web->is404); - $this->assertFalse($web->is500); + $this->assertTrue($web->isSuccess); + $this->assertFalse($web->isClientError); + $this->assertFalse($web->isServerError); // Assert access-helpers - $this->assertTrue($web->isOk); - $this->assertFalse($web->isUnauthorized); $this->assertFalse($web->isForbidden); $this->assertFalse($web->isNotFound); - $this->assertFalse($web->isServerError); - $this->assertFalse($web->isInternalServerError); } /** @@ -58,30 +47,19 @@ public function testNotFound() { $web = new \Spekulatius\PHPScraper\PHPScraper; - // Navigate to the test page: This redirects to phpscraper.de + // Navigate to the test page which doesn't exist. $web->go('https://test-pages.phpscraper.de/page-does-not-exist.html'); // Check the status itself. $this->assertSame(404, $web->statusCode); // Check the detailed states. 
- $this->assertFalse($web->is2xx); - $this->assertTrue($web->is4xx); - $this->assertFalse($web->is5xx); - $this->assertFalse($web->is200); - $this->assertFalse($web->is400); - $this->assertFalse($web->is401); - $this->assertFalse($web->is402); - $this->assertFalse($web->is403); - $this->assertTrue($web->is404); - $this->assertFalse($web->is500); + $this->assertFalse($web->isSuccess); + $this->assertTrue($web->isClientError); + $this->assertFalse($web->isServerError); // Assert access-helpers - $this->assertFalse($web->isOk); - $this->assertFalse($web->isUnauthorized); $this->assertFalse($web->isForbidden); $this->assertTrue($web->isNotFound); - $this->assertFalse($web->isServerError); - $this->assertFalse($web->isInternalServerError); } } From 6d7fd5ade6d5f29557df69ee97b0c187ddf8477b Mon Sep 17 00:00:00 2001 From: eposjk Date: Sun, 11 Dec 2022 02:48:53 +0100 Subject: [PATCH 06/17] first try: helper functions for handling response status --- demo.php | 39 +++++++++++++++++ src/GoutteClient.php | 102 +++++++++++++++++++++++++++++++++++++++++++ src/PHPScraper.php | 1 - src/UsesGoutte.php | 36 ++++++++++++++- 4 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 demo.php create mode 100644 src/GoutteClient.php diff --git a/demo.php b/demo.php new file mode 100644 index 0000000..861768e --- /dev/null +++ b/demo.php @@ -0,0 +1,39 @@ +go($url); +//var_dump($web->client); + +if($web->currentUrl !== $url) + echo 'redirected to ', $web->currentUrl, "\n"; +echo 'status code ', $web->statusCode, "\n"; + + + +if($web->isGone) { + echo "delete/deactivate record from database\n"; +} else { + if($web->permanentRedirectUrl !== '') { + echo 'url changed - update url in database to ', $web->permanentRedirectUrl, "\n"; + } + + $retryAt = $web->retryAt; + if($web->isSuccess) { + echo "got data successfully - process it now...\n"; + } elseif($web->isTemporaryResult) { + echo "temporary error\n"; + if(!$retryAt) + $retryAt = time() + 15*60; // FIXME: use longer times if 
we get the same status code multiple times + } else { + echo "might be a permanent error - but who knows if the server changes its mind (e.g. if the result is caused by some administrative work on the server) --> try several times before considering it final\n"; + if(!$retryAt) + $retryAt = time() + 24*60*60; // FIXME: use longer times if we get the same status code multiple times OR consider it somewhen really permanent and delete/deactivate record from database + } + if($retryAt) + echo 'retry at ', date('Y-m-d H:i:s', $retryAt), "\n"; +} diff --git a/src/GoutteClient.php b/src/GoutteClient.php new file mode 100644 index 0000000..15f09a3 --- /dev/null +++ b/src/GoutteClient.php @@ -0,0 +1,102 @@ +internalResponse->getStatusCode(); + if($status === 200 /* META REFRESH */ || $status === 301 /* Moved Permanently */ || $status === 308 /* Permanent Redirect */) { + if(!$this->usesTemporaryRedirect && empty($this->internalResponse->getHeader('Retry-After'))) + $this->permanentRedirectUrl = $this->redirect; + } else { // $status === 300 /* Multiple Choices */ || $status === 302 /* Found */ || $status === 303 /* See Other */ || $status === 307 /* Temporary Redirect */ + $this->usesTemporaryRedirect = true; + } + // 300 Multiple Choices might also be handled as permanent redirect + // META REFRESH might also be handled as temporary redirect if the delay is > 1s + return parent::followRedirect(); + } + + /** + * Evaluate the Retry-After header + * + * see https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Retry-After + * + * @return Response + */ + protected function filterResponse(object $response) + { + $retryAfterHeaders = $response->getHeader('Retry-After', false); + if(!empty($retryAfterHeaders)) { + $status = $this->internalResponse->getStatusCode(); + foreach($retryAfterHeaders as $retryAfter) { + if(is_numeric($retryAfter)) + $retryAt = time() + $retryAfter; + else + $retryAt = strtotime($retryAfter); + if($status >= 400) { // usually 429 Too Many Request or 
503 Service Unavailable + if($this->retryFailureAt < $retryAt) + $this->retryFailureAt = $retryAt; + } elseif($status >= 300) { + if($this->retryRedirectAt > $retryAt) + $this->retryRedirectAt = $retryAt; + } + } + } + return parent::filterResponse($response); + } + + /** + * Calculate the earliest moment to retry the request + * + * @return Response + */ + public function retryAt(): int { + if($this->retryFailureAt) + return $this->retryFailureAt; + if($this->retryRedirectAt < PHP_INT_MAX) + return $this->retryRedirectAt; + return 0; + } +} \ No newline at end of file diff --git a/src/PHPScraper.php b/src/PHPScraper.php index 8fe88be..f517a7c 100644 --- a/src/PHPScraper.php +++ b/src/PHPScraper.php @@ -8,7 +8,6 @@ * Most calls are passed through to the Core class. */ -use Goutte\Client as GoutteClient; use Symfony\Component\HttpClient\HttpClient as SymfonyHttpClient; class PHPScraper diff --git a/src/UsesGoutte.php b/src/UsesGoutte.php index 8d73430..24a6bf3 100644 --- a/src/UsesGoutte.php +++ b/src/UsesGoutte.php @@ -2,7 +2,6 @@ namespace Spekulatius\PHPScraper; -use Goutte\Client as GoutteClient; use Symfony\Component\DomCrawler\Crawler; use Symfony\Contracts\HttpClient\HttpClientInterface; @@ -134,6 +133,41 @@ public function clickLink($titleOrUrl): self return $this; } + public function usesTemporaryRedirect(): bool + { + return $this->client ? 
$this->client->usesTemporaryRedirect : false; + } + + public function isTemporaryResult(): bool + { + return $this->usesTemporaryRedirect() || \in_array($this->statusCode(), [408 /* Request Timeout */, 409 /* Conflict */, 419 /* Page Expired */, 420 /* Enhance Your Calm */, 421 /* Misdirected Request */, 423 /* Locked */, 425 /* Too Early */, 429 /* Too Many Requests */, 500 /* Internal Server Error */, 502 /* Bad Gateway */, 503 /* Service Unavailable */, 504 /* Gateway Timeout */, 507 /* Insufficient Storage */, 520 /* Web Server returned an unknown error */, 521 /* Web server is down */, 522 /* Connection Timed Out */, 523 /* Origin is unreachable */, 524 /* A timeout occurred */, 525 /* SSL Handshake Failed */, 527 /* Railgun Error */, 529 /* Site is overloaded */, 598 /* Network read timeout error */, 599 /* Network Connect Timeout Error */ ]); + } + + public function isGone(): bool + { + return !$this->isTemporaryResult() && $this->statusCode() === 410 /* Gone */; + } + + public function isPermanentError(): bool + { + return $this->statusCode() >= 400 && !$this->isTemporaryResult(); + } + + public function permanentRedirectUrl(): string + { + return $this->client ? ($this->client->permanentRedirectUrl ?? '') : ''; + } + + public function retryAt(): int + { + $retryAt = $this->client ? 
($this->client->retryAt()) : 0; + if($retryAt) + return $retryAt; + if($this->statusCode() === 509 /* Bandwidth Limit Exceeded */) + return strtotime('next month 12:00 UTC'); // give providers in each timezone the chance to reset the traffic quota for month + return 0; + } + public function statusCode(): int { if ($this->currentPage === null) { From 83ab28a3c99825b4c0de5849394ceed030462189 Mon Sep 17 00:00:00 2001 From: eposjk Date: Mon, 12 Dec 2022 00:15:42 +0100 Subject: [PATCH 07/17] fixed coding style --- demo.php | 30 ++++++++++++++++-------------- src/GoutteClient.php | 38 +++++++++++++++++++++++--------------- src/UsesGoutte.php | 36 +++++++++++++++++++++++++++++++----- 3 files changed, 70 insertions(+), 34 deletions(-) diff --git a/demo.php b/demo.php index 861768e..54e191c 100644 --- a/demo.php +++ b/demo.php @@ -1,39 +1,41 @@ go($url); //var_dump($web->client); -if($web->currentUrl !== $url) +if ($web->currentUrl !== $url) { echo 'redirected to ', $web->currentUrl, "\n"; +} echo 'status code ', $web->statusCode, "\n"; - - -if($web->isGone) { +if ($web->isGone) { echo "delete/deactivate record from database\n"; } else { - if($web->permanentRedirectUrl !== '') { + if ($web->permanentRedirectUrl !== '') { echo 'url changed - update url in database to ', $web->permanentRedirectUrl, "\n"; } $retryAt = $web->retryAt; - if($web->isSuccess) { + if ($web->isSuccess) { echo "got data successfully - process it now...\n"; - } elseif($web->isTemporaryResult) { + } elseif ($web->isTemporaryResult) { echo "temporary error\n"; - if(!$retryAt) - $retryAt = time() + 15*60; // FIXME: use longer times if we get the same status code multiple times + if (!$retryAt) { + $retryAt = time() + 15*60; + } // FIXME: use longer times if we get the same status code multiple times } else { echo "might be a permanent error - but who knows if the server changes its mind (e.g. 
if the result is caused by some administrative work on the server) --> try several times before considering it final\n"; - if(!$retryAt) - $retryAt = time() + 24*60*60; // FIXME: use longer times if we get the same status code multiple times OR consider it somewhen really permanent and delete/deactivate record from database + if (!$retryAt) { + $retryAt = time() + 24*60*60; + } // FIXME: use longer times if we get the same status code multiple times OR consider it somewhen really permanent and delete/deactivate record from database } - if($retryAt) + if ($retryAt) { echo 'retry at ', date('Y-m-d H:i:s', $retryAt), "\n"; + } } diff --git a/src/GoutteClient.php b/src/GoutteClient.php index 15f09a3..f715177 100644 --- a/src/GoutteClient.php +++ b/src/GoutteClient.php @@ -8,7 +8,6 @@ /** * Extended Goutte\Client with PHPScraper specific methods */ - class GoutteClient extends Client { /** @@ -47,9 +46,10 @@ class GoutteClient extends Client public function followRedirect(): Crawler { $status = $this->internalResponse->getStatusCode(); - if($status === 200 /* META REFRESH */ || $status === 301 /* Moved Permanently */ || $status === 308 /* Permanent Redirect */) { - if(!$this->usesTemporaryRedirect && empty($this->internalResponse->getHeader('Retry-After'))) + if ($status === 200 /* META REFRESH */ || $status === 301 /* Moved Permanently */ || $status === 308 /* Permanent Redirect */) { + if (!$this->usesTemporaryRedirect && empty($this->internalResponse->getHeader('Retry-After'))) { $this->permanentRedirectUrl = $this->redirect; + } } else { // $status === 300 /* Multiple Choices */ || $status === 302 /* Found */ || $status === 303 /* See Other */ || $status === 307 /* Temporary Redirect */ $this->usesTemporaryRedirect = true; } @@ -68,22 +68,26 @@ public function followRedirect(): Crawler protected function filterResponse(object $response) { $retryAfterHeaders = $response->getHeader('Retry-After', false); - if(!empty($retryAfterHeaders)) { + if 
(!empty($retryAfterHeaders)) { $status = $this->internalResponse->getStatusCode(); - foreach($retryAfterHeaders as $retryAfter) { - if(is_numeric($retryAfter)) + foreach ($retryAfterHeaders as $retryAfter) { + if (is_numeric($retryAfter)) { $retryAt = time() + $retryAfter; - else + } else { $retryAt = strtotime($retryAfter); - if($status >= 400) { // usually 429 Too Many Request or 503 Service Unavailable - if($this->retryFailureAt < $retryAt) + } + if ($status >= 400) { // usually 429 Too Many Request or 503 Service Unavailable + if ($this->retryFailureAt < $retryAt) { $this->retryFailureAt = $retryAt; - } elseif($status >= 300) { - if($this->retryRedirectAt > $retryAt) + } + } elseif ($status >= 300) { + if ($this->retryRedirectAt > $retryAt) { $this->retryRedirectAt = $retryAt; + } } } } + return parent::filterResponse($response); } @@ -92,11 +96,15 @@ protected function filterResponse(object $response) * * @return Response */ - public function retryAt(): int { - if($this->retryFailureAt) + public function retryAt(): int + { + if ($this->retryFailureAt) { return $this->retryFailureAt; - if($this->retryRedirectAt < PHP_INT_MAX) + } + if ($this->retryRedirectAt < PHP_INT_MAX) { return $this->retryRedirectAt; + } + return 0; } -} \ No newline at end of file +} diff --git a/src/UsesGoutte.php b/src/UsesGoutte.php index 24a6bf3..54d4496 100644 --- a/src/UsesGoutte.php +++ b/src/UsesGoutte.php @@ -140,7 +140,31 @@ public function usesTemporaryRedirect(): bool public function isTemporaryResult(): bool { - return $this->usesTemporaryRedirect() || \in_array($this->statusCode(), [408 /* Request Timeout */, 409 /* Conflict */, 419 /* Page Expired */, 420 /* Enhance Your Calm */, 421 /* Misdirected Request */, 423 /* Locked */, 425 /* Too Early */, 429 /* Too Many Requests */, 500 /* Internal Server Error */, 502 /* Bad Gateway */, 503 /* Service Unavailable */, 504 /* Gateway Timeout */, 507 /* Insufficient Storage */, 520 /* Web Server returned an unknown error */, 521 /* 
Web server is down */, 522 /* Connection Timed Out */, 523 /* Origin is unreachable */, 524 /* A timeout occurred */, 525 /* SSL Handshake Failed */, 527 /* Railgun Error */, 529 /* Site is overloaded */, 598 /* Network read timeout error */, 599 /* Network Connect Timeout Error */ ]); + return $this->usesTemporaryRedirect() || \in_array($this->statusCode(), [ + 408, // Request Timeout + 409, // Conflict + 419, // Page Expired + 420, // Enhance Your Calm + 421, // Misdirected Request + 423, // Locked + 425, // Too Early + 429, // Too Many Requests + 500, // Internal Server Error + 502, // Bad Gateway + 503, // Service Unavailable + 504, // Gateway Timeout + 507, // Insufficient Storage + 520, // Web Server returned an unknown error + 521, // Web Server is down + 522, // Connection Timed Out + 523, // Origin is unreachable + 524, // A timeout occurred + 525, // SSL Handshake Failed + 527, // Railgun Error + 529, // Site is overloaded + 598, // Network read timeout error + 599, // Network Connect Timeout Error + ]); } public function isGone(): bool @@ -161,10 +185,12 @@ public function permanentRedirectUrl(): string public function retryAt(): int { $retryAt = $this->client ? 
($this->client->retryAt()) : 0; - if($retryAt) + if ($retryAt) { return $retryAt; - if($this->statusCode() === 509 /* Bandwidth Limit Exceeded */) - return strtotime('next month 12:00 UTC'); // give providers in each timezone the chance to reset the traffic quota for month + } + if ($this->statusCode() === 509 /* Bandwidth Limit Exceeded */) { + return strtotime('next month 12:00 UTC'); + } // give providers in each timezone the chance to reset the traffic quota for month return 0; } @@ -201,4 +227,4 @@ public function isNotFound(): bool { return $this->statusCode() === 404; } -} \ No newline at end of file +} From 25a2379d5193eb0c74f132ff703a21b6a877c6e2 Mon Sep 17 00:00:00 2001 From: eposjk Date: Mon, 12 Dec 2022 00:56:50 +0100 Subject: [PATCH 08/17] remember request properties from last go() request --- src/GoutteClient.php | 11 +++++++++ src/UsesGoutte.php | 54 ++++++++++++++++++++++++++++++++------------ 2 files changed, 50 insertions(+), 15 deletions(-) diff --git a/src/GoutteClient.php b/src/GoutteClient.php index f715177..461d170 100644 --- a/src/GoutteClient.php +++ b/src/GoutteClient.php @@ -38,6 +38,17 @@ class GoutteClient extends Client */ protected $retryFailureAt = 0; + /** + * Reset internal variables + */ + public function initNewRequest() + { + $this->usesTemporaryRedirect = false; + $this->permanentRedirectUrl = null; + $this->retryRedirectAt = PHP_INT_MAX; + $this->retryFailureAt = 0; + } + /** * Remember permanent redirect url and detect if the redirect chain contains temporary redirects * diff --git a/src/UsesGoutte.php b/src/UsesGoutte.php index 54d4496..df08f4f 100644 --- a/src/UsesGoutte.php +++ b/src/UsesGoutte.php @@ -28,6 +28,27 @@ trait UsesGoutte */ protected $currentPage = null; + /** + * Was a temporary redirect involved in loading this request? + * + * @var bool + */ + protected $usesTemporaryRedirect = false; + + /** + * Should subsequent requests go to a different URL? 
+ * + * @var string + */ + protected $permanentRedirectUrl = ''; + + /** + * Which is the earliest moment to retry the request? (unix timestamp) + * + * @var int + */ + protected $retryAt = 0; + /** * Overwrites the client * @@ -73,9 +94,19 @@ public function client(): GoutteClient */ public function go(string $url): self { + $this->client->initNewRequest(); + // Keep it around for internal processing. $this->currentPage = $this->client->request('GET', $url); + // Remember request properties. + $this->usesTemporaryRedirect = $this->client->usesTemporaryRedirect; + $this->permanentRedirectUrl = $this->client->permanentRedirectUrl ?? ''; + $this->retryAt = $this->client->retryAt(); + if (!$this->retryAt && $this->statusCode() === 509 /* Bandwidth Limit Exceeded */) { + $this->retryAt = strtotime('next month 12:00 UTC'); + // give providers in each timezone the chance to reset the traffic quota for month + } return $this; } @@ -133,14 +164,9 @@ public function clickLink($titleOrUrl): self return $this; } - public function usesTemporaryRedirect(): bool - { - return $this->client ? $this->client->usesTemporaryRedirect : false; - } - public function isTemporaryResult(): bool { - return $this->usesTemporaryRedirect() || \in_array($this->statusCode(), [ + return $this->usesTemporaryRedirect || \in_array($this->statusCode(), [ 408, // Request Timeout 409, // Conflict 419, // Page Expired @@ -177,21 +203,19 @@ public function isPermanentError(): bool return $this->statusCode() >= 400 && !$this->isTemporaryResult(); } + public function usesTemporaryRedirect(): bool + { + return $this->usesTemporaryRedirect; + } + public function permanentRedirectUrl(): string { - return $this->client ? ($this->client->permanentRedirectUrl ?? '') : ''; + return $this->permanentRedirectUrl; } public function retryAt(): int { - $retryAt = $this->client ? 
($this->client->retryAt()) : 0; - if ($retryAt) { - return $retryAt; - } - if ($this->statusCode() === 509 /* Bandwidth Limit Exceeded */) { - return strtotime('next month 12:00 UTC'); - } // give providers in each timezone the chance to reset the traffic quota for month - return 0; + return $this->retryAt; } public function statusCode(): int From d67c26d83cb659705e7e973c5c8c2ac4ae37694e Mon Sep 17 00:00:00 2001 From: eposjk Date: Mon, 12 Dec 2022 01:46:20 +0100 Subject: [PATCH 09/17] bugfix for retryAt() --- src/GoutteClient.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/GoutteClient.php b/src/GoutteClient.php index 461d170..a5ff1e1 100644 --- a/src/GoutteClient.php +++ b/src/GoutteClient.php @@ -80,7 +80,7 @@ protected function filterResponse(object $response) { $retryAfterHeaders = $response->getHeader('Retry-After', false); if (!empty($retryAfterHeaders)) { - $status = $this->internalResponse->getStatusCode(); + $status = $response->getStatusCode(); foreach ($retryAfterHeaders as $retryAfter) { if (is_numeric($retryAfter)) { $retryAt = time() + $retryAfter; From fa1ca44171d61d4084e31f2fb1d7d94efbacf91b Mon Sep 17 00:00:00 2001 From: eposjk Date: Mon, 12 Dec 2022 01:49:24 +0100 Subject: [PATCH 10/17] enhanced status code tests --- tests/StatusCodeTest.php | 148 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 146 insertions(+), 2 deletions(-) diff --git a/tests/StatusCodeTest.php b/tests/StatusCodeTest.php index 4f09af8..78ae631 100644 --- a/tests/StatusCodeTest.php +++ b/tests/StatusCodeTest.php @@ -24,8 +24,8 @@ public function testOk() { $web = new \Spekulatius\PHPScraper\PHPScraper; - // Navigate to the test page: This redirects to phpscraper.de - $web->go('https://phpscraper.de'); + // Navigate to the test page without redirect + $web->go('https://phpscraper.de/'); // Check the status itself. 
$this->assertSame(200, $web->statusCode); @@ -34,6 +34,14 @@ public function testOk() $this->assertTrue($web->isSuccess); $this->assertFalse($web->isClientError); $this->assertFalse($web->isServerError); + $this->assertFalse($web->isTemporaryResult); + $this->assertFalse($web->isGone); + $this->assertFalse($web->isPermanentError); + + // Check the request properties + $this->assertFalse($web->usesTemporaryRedirect); + $this->assertSame('', $web->permanentRedirectUrl); + $this->assertSame(0, $web->retryAt); // Assert access-helpers $this->assertFalse($web->isForbidden); @@ -57,9 +65,145 @@ public function testNotFound() $this->assertFalse($web->isSuccess); $this->assertTrue($web->isClientError); $this->assertFalse($web->isServerError); + $this->assertFalse($web->isTemporaryResult); + $this->assertFalse($web->isGone); + $this->assertTrue($web->isPermanentError); + + // Check the request properties + $this->assertFalse($web->usesTemporaryRedirect); + $this->assertSame('', $web->permanentRedirectUrl); + $this->assertSame(0, $web->retryAt); // Assert access-helpers $this->assertFalse($web->isForbidden); $this->assertTrue($web->isNotFound); } + + /** + * @test + */ + public function testPermanentRedirect() + { + $web = new \Spekulatius\PHPScraper\PHPScraper; + + // Navigate to the test page with 301 permanent redirect + $web->go('http://phpscraper.de/'); + + // Check the status itself. + $this->assertSame(200, $web->statusCode); + + // Check the detailed states. 
+ $this->assertTrue($web->isSuccess); + $this->assertFalse($web->isClientError); + $this->assertFalse($web->isServerError); + $this->assertFalse($web->isTemporaryResult); + $this->assertFalse($web->isGone); + $this->assertFalse($web->isPermanentError); + + // Check the request properties + $this->assertFalse($web->usesTemporaryRedirect); + $this->assertSame('https://phpscraper.de/', $web->permanentRedirectUrl); + $this->assertSame(0, $web->retryAt); + + // Assert access-helpers + $this->assertFalse($web->isForbidden); + $this->assertFalse($web->isNotFound); + } + + /** + * @test + */ + public function testTemporaryRedirect() + { + $web = new \Spekulatius\PHPScraper\PHPScraper; + + // Navigate to the test page with 307 temporary redirect + $web->go('https://httpstat.us/307'); + + // Check the status itself. + $this->assertSame(200, $web->statusCode); + + // Check the detailed states. + $this->assertTrue($web->isSuccess); + $this->assertFalse($web->isClientError); + $this->assertFalse($web->isServerError); + $this->assertTrue($web->isTemporaryResult); + $this->assertFalse($web->isGone); + $this->assertFalse($web->isPermanentError); + + // Check the request properties + $this->assertTrue($web->usesTemporaryRedirect); + $this->assertSame('', $web->permanentRedirectUrl); + $this->assertSame(0, $web->retryAt); + + // Assert access-helpers + $this->assertFalse($web->isForbidden); + $this->assertFalse($web->isNotFound); + } + + /** + * @test + */ + public function testGone() + { + $web = new \Spekulatius\PHPScraper\PHPScraper; + + // Navigate to the test page + $web->go('https://httpstat.us/410'); + + // Check the status itself. + $this->assertSame(410, $web->statusCode); + + // Check the detailed states. 
+ $this->assertFalse($web->isSuccess); + $this->assertTrue($web->isClientError); + $this->assertFalse($web->isServerError); + $this->assertFalse($web->isTemporaryResult); + $this->assertTrue($web->isGone); + $this->assertTrue($web->isPermanentError); + + // Check the request properties + $this->assertFalse($web->usesTemporaryRedirect); + $this->assertSame('', $web->permanentRedirectUrl); + $this->assertSame(0, $web->retryAt); + + // Assert access-helpers + $this->assertFalse($web->isForbidden); + $this->assertFalse($web->isNotFound); + } + + /** + * @test + */ + public function testTooManyRequests() + { + $web = new \Spekulatius\PHPScraper\PHPScraper; + + // Navigate to the test page which returns "429 Too Many Requests" with "Retry-After: 5" header + $t1 = time(); + $web->go('https://httpstat.us/429'); + $t2 = time(); + + // Check the status itself. + $this->assertSame(429, $web->statusCode); + + // Check the detailed states. + $this->assertFalse($web->isSuccess); + $this->assertTrue($web->isClientError); + $this->assertFalse($web->isServerError); + $this->assertTrue($web->isTemporaryResult); + $this->assertFalse($web->isGone); + $this->assertFalse($web->isPermanentError); + + // Check the request properties + $this->assertFalse($web->usesTemporaryRedirect); + $this->assertSame('', $web->permanentRedirectUrl); + $this->assertGreaterThan($t1, $web->retryAt); + $this->assertLessThanOrEqual($t2 + 5, $web->retryAt); + + // Assert access-helpers + $this->assertFalse($web->isForbidden); + $this->assertFalse($web->isNotFound); + } + + } From 646871dcfbd94cee0dc67304a844b84a29e732eb Mon Sep 17 00:00:00 2001 From: eposjk Date: Mon, 12 Dec 2022 01:53:34 +0100 Subject: [PATCH 11/17] removed isClientError() / isServerError() / isForbidden() / isNotFound() --- src/UsesGoutte.php | 19 ------------------- tests/StatusCodeTest.php | 36 ------------------------------------ 2 files changed, 55 deletions(-) diff --git a/src/UsesGoutte.php b/src/UsesGoutte.php index df08f4f..0bc6b58 
100644 --- a/src/UsesGoutte.php +++ b/src/UsesGoutte.php @@ -232,23 +232,4 @@ public function isSuccess(): bool return $this->statusCode() >= 200 && $this->statusCode() <= 299; } - public function isClientError(): bool - { - return $this->statusCode() >= 400 && $this->statusCode() <= 499; - } - - public function isServerError(): bool - { - return $this->statusCode() >= 500 && $this->statusCode() <= 599; - } - - public function isForbidden(): bool - { - return $this->statusCode() === 403; - } - - public function isNotFound(): bool - { - return $this->statusCode() === 404; - } } diff --git a/tests/StatusCodeTest.php b/tests/StatusCodeTest.php index 78ae631..b7fb5e7 100644 --- a/tests/StatusCodeTest.php +++ b/tests/StatusCodeTest.php @@ -32,8 +32,6 @@ public function testOk() // Check the detailed states. $this->assertTrue($web->isSuccess); - $this->assertFalse($web->isClientError); - $this->assertFalse($web->isServerError); $this->assertFalse($web->isTemporaryResult); $this->assertFalse($web->isGone); $this->assertFalse($web->isPermanentError); @@ -42,10 +40,6 @@ public function testOk() $this->assertFalse($web->usesTemporaryRedirect); $this->assertSame('', $web->permanentRedirectUrl); $this->assertSame(0, $web->retryAt); - - // Assert access-helpers - $this->assertFalse($web->isForbidden); - $this->assertFalse($web->isNotFound); } /** @@ -63,8 +57,6 @@ public function testNotFound() // Check the detailed states. 
$this->assertFalse($web->isSuccess); - $this->assertTrue($web->isClientError); - $this->assertFalse($web->isServerError); $this->assertFalse($web->isTemporaryResult); $this->assertFalse($web->isGone); $this->assertTrue($web->isPermanentError); @@ -73,10 +65,6 @@ public function testNotFound() $this->assertFalse($web->usesTemporaryRedirect); $this->assertSame('', $web->permanentRedirectUrl); $this->assertSame(0, $web->retryAt); - - // Assert access-helpers - $this->assertFalse($web->isForbidden); - $this->assertTrue($web->isNotFound); } /** @@ -94,8 +82,6 @@ public function testPermanentRedirect() // Check the detailed states. $this->assertTrue($web->isSuccess); - $this->assertFalse($web->isClientError); - $this->assertFalse($web->isServerError); $this->assertFalse($web->isTemporaryResult); $this->assertFalse($web->isGone); $this->assertFalse($web->isPermanentError); @@ -104,10 +90,6 @@ public function testPermanentRedirect() $this->assertFalse($web->usesTemporaryRedirect); $this->assertSame('https://phpscraper.de/', $web->permanentRedirectUrl); $this->assertSame(0, $web->retryAt); - - // Assert access-helpers - $this->assertFalse($web->isForbidden); - $this->assertFalse($web->isNotFound); } /** @@ -125,8 +107,6 @@ public function testTemporaryRedirect() // Check the detailed states. $this->assertTrue($web->isSuccess); - $this->assertFalse($web->isClientError); - $this->assertFalse($web->isServerError); $this->assertTrue($web->isTemporaryResult); $this->assertFalse($web->isGone); $this->assertFalse($web->isPermanentError); @@ -135,10 +115,6 @@ public function testTemporaryRedirect() $this->assertTrue($web->usesTemporaryRedirect); $this->assertSame('', $web->permanentRedirectUrl); $this->assertSame(0, $web->retryAt); - - // Assert access-helpers - $this->assertFalse($web->isForbidden); - $this->assertFalse($web->isNotFound); } /** @@ -156,8 +132,6 @@ public function testGone() // Check the detailed states. 
$this->assertFalse($web->isSuccess); - $this->assertTrue($web->isClientError); - $this->assertFalse($web->isServerError); $this->assertFalse($web->isTemporaryResult); $this->assertTrue($web->isGone); $this->assertTrue($web->isPermanentError); @@ -166,10 +140,6 @@ public function testGone() $this->assertFalse($web->usesTemporaryRedirect); $this->assertSame('', $web->permanentRedirectUrl); $this->assertSame(0, $web->retryAt); - - // Assert access-helpers - $this->assertFalse($web->isForbidden); - $this->assertFalse($web->isNotFound); } /** @@ -189,8 +159,6 @@ public function testTooManyRequests() // Check the detailed states. $this->assertFalse($web->isSuccess); - $this->assertTrue($web->isClientError); - $this->assertFalse($web->isServerError); $this->assertTrue($web->isTemporaryResult); $this->assertFalse($web->isGone); $this->assertFalse($web->isPermanentError); @@ -200,10 +168,6 @@ public function testTooManyRequests() $this->assertSame('', $web->permanentRedirectUrl); $this->assertGreaterThan($t1, $web->retryAt); $this->assertLessThanOrEqual($t2 + 5, $web->retryAt); - - // Assert access-helpers - $this->assertFalse($web->isForbidden); - $this->assertFalse($web->isNotFound); } } From fd9006b1c80e80eca6e6c543d6e3930c681e42a5 Mon Sep 17 00:00:00 2001 From: eposjk Date: Mon, 12 Dec 2022 01:58:36 +0100 Subject: [PATCH 12/17] --- demo.php | 1 - 1 file changed, 1 deletion(-) diff --git a/demo.php b/demo.php index 54e191c..36c0916 100644 --- a/demo.php +++ b/demo.php @@ -7,7 +7,6 @@ echo 'requesting ', $url, "\n"; $web = new \Spekulatius\PHPScraper\PHPScraper(); $web->go($url); -//var_dump($web->client); if ($web->currentUrl !== $url) { echo 'redirected to ', $web->currentUrl, "\n"; From 8bc4c11a63477ce08549a395ddbcedab98acfaca Mon Sep 17 00:00:00 2001 From: eposjk Date: Fri, 16 Dec 2022 23:00:18 +0100 Subject: [PATCH 13/17] moved demo.php to examples/CheckStatus.php --- demo.php => examples/CheckStatus.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename 
demo.php => examples/CheckStatus.php (97%) diff --git a/demo.php b/examples/CheckStatus.php similarity index 97% rename from demo.php rename to examples/CheckStatus.php index 36c0916..d4a5eba 100644 --- a/demo.php +++ b/examples/CheckStatus.php @@ -1,6 +1,6 @@ Date: Fri, 16 Dec 2022 23:01:00 +0100 Subject: [PATCH 14/17] fixed retryAt() docblock --- src/GoutteClient.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/GoutteClient.php b/src/GoutteClient.php index a5ff1e1..62dfcd1 100644 --- a/src/GoutteClient.php +++ b/src/GoutteClient.php @@ -105,7 +105,7 @@ protected function filterResponse(object $response) /** * Calculate the earliest moment to retry the request * - * @return Response + * @return int */ public function retryAt(): int { From 6bee2d7e7b6d9894179fa61aa3b73b99a74491d4 Mon Sep 17 00:00:00 2001 From: eposjk Date: Sat, 17 Dec 2022 00:01:21 +0100 Subject: [PATCH 15/17] keep state in GoutteClient --- src/GoutteClient.php | 37 ++++++++++++++++++++++++++++------- src/UsesGoutte.php | 46 +++++++++++--------------------------------- 2 files changed, 41 insertions(+), 42 deletions(-) diff --git a/src/GoutteClient.php b/src/GoutteClient.php index 62dfcd1..a4bc232 100644 --- a/src/GoutteClient.php +++ b/src/GoutteClient.php @@ -10,6 +10,15 @@ */ class GoutteClient extends Client { + /** + * Is this the main request or a subrequest? + * + * (should always contain the same value as the private parent::$isMainRequest) + * + * @var bool + */ + private $isMainRequest = true; + /** * Was a temporary redirect involved in loading this request? * @@ -39,14 +48,25 @@ class GoutteClient extends Client protected $retryFailureAt = 0; /** - * Reset internal variables + * Reset internal variables before calling a URI. 
+ * + * @param string $method The request method + * @param string $uri The URI to fetch + * @param array $parameters The Request parameters + * @param array $files The files + * @param array $server The server parameters (HTTP headers are referenced with an HTTP_ prefix as PHP does) + * @param string $content The raw body data + * @param bool $changeHistory Whether to update the history or not (only used internally for back(), forward(), and reload()) */ - public function initNewRequest() + public function request(string $method, string $uri, array $parameters = [], array $files = [], array $server = [], string $content = null, bool $changeHistory = true): Crawler { - $this->usesTemporaryRedirect = false; - $this->permanentRedirectUrl = null; - $this->retryRedirectAt = PHP_INT_MAX; - $this->retryFailureAt = 0; + if ($this->isMainRequest) { + $this->usesTemporaryRedirect = false; + $this->permanentRedirectUrl = null; + $this->retryRedirectAt = PHP_INT_MAX; + $this->retryFailureAt = 0; + } + return parent::request($method, $uri, $parameters, $files, $server, $content, $changeHistory); } /** @@ -56,6 +76,7 @@ public function initNewRequest() */ public function followRedirect(): Crawler { + $this->isMainRequest = false; $status = $this->internalResponse->getStatusCode(); if ($status === 200 /* META REFRESH */ || $status === 301 /* Moved Permanently */ || $status === 308 /* Permanent Redirect */) { if (!$this->usesTemporaryRedirect && empty($this->internalResponse->getHeader('Retry-After'))) { @@ -66,7 +87,9 @@ public function followRedirect(): Crawler } // 300 Multiple Choices might also be handled as permanent redirect // META REFRESH might also be handled as temporary redirect if the delay is > 1s - return parent::followRedirect(); + $response = parent::followRedirect(); + $this->isMainRequest = true; + return $response; } /** diff --git a/src/UsesGoutte.php b/src/UsesGoutte.php index 0bc6b58..8fac8e2 100644 --- a/src/UsesGoutte.php +++ b/src/UsesGoutte.php @@ 
-28,27 +28,6 @@ trait UsesGoutte */ protected $currentPage = null; - /** - * Was a temporary redirect involved in loading this request? - * - * @var bool - */ - protected $usesTemporaryRedirect = false; - - /** - * Should subsequent requests go to a different URL? - * - * @var string - */ - protected $permanentRedirectUrl = ''; - - /** - * Which is the earliest moment to retry the request? (unix timestamp) - * - * @var int - */ - protected $retryAt = 0; - /** * Overwrites the client * @@ -94,19 +73,9 @@ public function client(): GoutteClient */ public function go(string $url): self { - $this->client->initNewRequest(); - // Keep it around for internal processing. $this->currentPage = $this->client->request('GET', $url); - // Remember request properties. - $this->usesTemporaryRedirect = $this->client->usesTemporaryRedirect; - $this->permanentRedirectUrl = $this->client->permanentRedirectUrl ?? ''; - $this->retryAt = $this->client->retryAt(); - if (!$this->retryAt && $this->statusCode() === 509 /* Bandwidth Limit Exceeded */) { - $this->retryAt = strtotime('next month 12:00 UTC'); - // give providers in each timezone the chance to reset the traffic quota for month - } return $this; } @@ -166,7 +135,7 @@ public function clickLink($titleOrUrl): self public function isTemporaryResult(): bool { - return $this->usesTemporaryRedirect || \in_array($this->statusCode(), [ + return $this->usesTemporaryRedirect() || \in_array($this->statusCode(), [ 408, // Request Timeout 409, // Conflict 419, // Page Expired @@ -205,17 +174,24 @@ public function isPermanentError(): bool public function usesTemporaryRedirect(): bool { - return $this->usesTemporaryRedirect; + return $this->client ? $this->client->usesTemporaryRedirect : false; } public function permanentRedirectUrl(): string { - return $this->permanentRedirectUrl; + return $this->client ? ($this->client->permanentRedirectUrl ?? '') : ''; } public function retryAt(): int { - return $this->retryAt; + $retryAt = $this->client ? 
($this->client->retryAt()) : 0; + if ($retryAt) { + return $retryAt; + } + if ($this->statusCode() === 509 /* Bandwidth Limit Exceeded */) { + return strtotime('next month 12:00 UTC'); + } // give providers in each timezone the chance to reset the traffic quota for month + return 0; } public function statusCode(): int From d5d98bc98ac6f273cc7cf44985c8e41989e79a58 Mon Sep 17 00:00:00 2001 From: eposjk Date: Thu, 22 Dec 2022 01:40:21 +0100 Subject: [PATCH 16/17] return status 499 for timeout and 0 for network errors instead of throwing exceptions --- src/GoutteClient.php | 18 ++++++++++++++- src/UsesGoutte.php | 4 +++- tests/StatusCodeTest.php | 50 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+), 2 deletions(-) diff --git a/src/GoutteClient.php b/src/GoutteClient.php index a4bc232..2710a88 100644 --- a/src/GoutteClient.php +++ b/src/GoutteClient.php @@ -4,6 +4,9 @@ use Goutte\Client; use Symfony\Component\DomCrawler\Crawler; +use Symfony\Component\BrowserKit\Response; +use Symfony\Component\HttpClient\Exception\TimeoutException; +use Symfony\Contracts\HttpClient\Exception\TransportExceptionInterface; /** * Extended Goutte\Client with PHPScraper specific methods @@ -66,7 +69,20 @@ public function request(string $method, string $uri, array $parameters = [], arr $this->retryRedirectAt = PHP_INT_MAX; $this->retryFailureAt = 0; } - return parent::request($method, $uri, $parameters, $files, $server, $content, $changeHistory); + try { + return parent::request($method, $uri, $parameters, $files, $server, $content, $changeHistory); + } catch (TimeoutException $e) { + $content = $e->getMessage(); + $status = 499; // Client Closed Request + } catch (TransportExceptionInterface $e) { + $content = $e->getMessage(); + $status = 0; // Network Error + } + $this->response = new Response($content, $status, ['Content-Type' => 'text/plain', 'Content-Length' => strlen($content), 'Date' => gmdate('D, d M Y H:i:s T')]); + $this->internalResponse = 
$this->filterResponse($this->response); + $this->redirect = null; + $this->crawler = $this->createCrawlerFromContent($this->internalRequest->getUri(), $this->internalResponse->getContent(), $this->internalResponse->getHeader('Content-Type') ?? ''); + return $this->crawler; } /** diff --git a/src/UsesGoutte.php b/src/UsesGoutte.php index 8fac8e2..969d09c 100644 --- a/src/UsesGoutte.php +++ b/src/UsesGoutte.php @@ -136,6 +136,7 @@ public function clickLink($titleOrUrl): self public function isTemporaryResult(): bool { return $this->usesTemporaryRedirect() || \in_array($this->statusCode(), [ + 0, // Network Error 408, // Request Timeout 409, // Conflict 419, // Page Expired @@ -144,6 +145,7 @@ public function isTemporaryResult(): bool 423, // Locked 425, // Too Early 429, // Too Many Requests + 499, // Client Closed Request (Timeout) 500, // Internal Server Error 502, // Bad Gateway 503, // Service Unavailable @@ -169,7 +171,7 @@ public function isGone(): bool public function isPermanentError(): bool { - return $this->statusCode() >= 400 && !$this->isTemporaryResult(); + return (!$this->statusCode() || $this->statusCode() >= 400) && !$this->isTemporaryResult(); } public function usesTemporaryRedirect(): bool diff --git a/tests/StatusCodeTest.php b/tests/StatusCodeTest.php index b7fb5e7..90f73c6 100644 --- a/tests/StatusCodeTest.php +++ b/tests/StatusCodeTest.php @@ -170,4 +170,54 @@ public function testTooManyRequests() $this->assertLessThanOrEqual($t2 + 5, $web->retryAt); } + /** + * @test + */ + public function testNetworkError() + { + $web = new \Spekulatius\PHPScraper\PHPScraper; + + // Navigate to the test page which is invalid + $web->go('https://example.tld/'); + + // Check the status itself. + $this->assertSame(0, $web->statusCode); + + // Check the detailed states. 
+ $this->assertFalse($web->isSuccess); + $this->assertTrue($web->isTemporaryResult); + $this->assertFalse($web->isGone); + $this->assertFalse($web->isPermanentError); + + // Check the request properties + $this->assertFalse($web->usesTemporaryRedirect); + $this->assertSame('', $web->permanentRedirectUrl); + $this->assertSame(0, $web->retryAt); + } + + /** + * @test + */ + public function testTimeout() + { + $web = new \Spekulatius\PHPScraper\PHPScraper(['timeout' => 0]); + + // Navigate to the test page + $web->go('https://phpscraper.de/'); + + // Check the status itself. + $this->assertSame(499, $web->statusCode); + + // Check the detailed states. + $this->assertFalse($web->isSuccess); + $this->assertTrue($web->isTemporaryResult); + $this->assertFalse($web->isGone); + $this->assertFalse($web->isPermanentError); + + // Check the request properties + $this->assertFalse($web->usesTemporaryRedirect); + $this->assertSame('', $web->permanentRedirectUrl); + $this->assertSame(0, $web->retryAt); + } + } From c881f73511e325f38d2caf011def01861d026d6b Mon Sep 17 00:00:00 2001 From: eposjk Date: Thu, 22 Dec 2022 23:51:43 +0100 Subject: [PATCH 17/17] define status 0 Network Error (e.g. caused by invalid domains) as permanent error --- src/UsesGoutte.php | 1 - tests/StatusCodeTest.php | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/UsesGoutte.php b/src/UsesGoutte.php index 969d09c..2fc68a8 100644 --- a/src/UsesGoutte.php +++ b/src/UsesGoutte.php @@ -136,7 +136,6 @@ public function clickLink($titleOrUrl): self public function isTemporaryResult(): bool { return $this->usesTemporaryRedirect() || \in_array($this->statusCode(), [ - 0, // Network Error 408, // Request Timeout 409, // Conflict 419, // Page Expired diff --git a/tests/StatusCodeTest.php b/tests/StatusCodeTest.php index 90f73c6..e8c2771 100644 --- a/tests/StatusCodeTest.php +++ b/tests/StatusCodeTest.php @@ -185,9 +185,9 @@ public function testNetworkError() // Check the detailed states. 
$this->assertFalse($web->isSuccess); - $this->assertTrue($web->isTemporaryResult); + $this->assertFalse($web->isTemporaryResult); $this->assertFalse($web->isGone); - $this->assertFalse($web->isPermanentError); + $this->assertTrue($web->isPermanentError); // Check the request properties $this->assertFalse($web->usesTemporaryRedirect);