Skip to content

Commit ba0be4d

Browse files
committed
wip
1 parent 9a15f48 commit ba0be4d

File tree

5 files changed

+301
-5
lines changed

5 files changed

+301
-5
lines changed

src/plugin/class-controller-registry.php

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,29 @@
44

55
class Controller_Registry {

	/**
	 * Instantiates the REST controllers of the plugin.
	 *
	 * The blogpost and page controllers receive the liberated data post type.
	 * The crawler controller receives the domain inferred from existing
	 * liberated data together with the crawler queue post type.
	 *
	 * @param string $liberated_data_post_type Post type holding liberated data.
	 * @param string $crawler_queue_post_type  Post type holding the crawler queue.
	 */
	public function __construct( string $liberated_data_post_type, string $crawler_queue_post_type ) {
		new Blogpost_Controller( $liberated_data_post_type );
		new Page_Controller( $liberated_data_post_type );

		$domain = $this->infer_domain( $liberated_data_post_type );

		new Crawler_Controller( $domain, $crawler_queue_post_type );
	}

	/**
	 * Derives the origin (scheme://host[:port]) of the site being liberated
	 * from the guid of one draft post of the liberated data post type.
	 *
	 * @param string $liberated_data_post_type Post type holding liberated data.
	 *
	 * @return string The origin, or an empty string when no draft post exists
	 *                or its guid does not contain both a scheme and a host.
	 */
	private function infer_domain( $liberated_data_post_type ): string {
		$liberated_posts = get_posts(
			array(
				'post_type'      => $liberated_data_post_type,
				'posts_per_page' => 1,
				'post_status'    => 'draft',
			)
		);

		if ( empty( $liberated_posts ) ) {
			return '';
		}

		// wp_parse_url() returns false for seriously malformed URLs and omits
		// components that are absent, so never index the result blindly.
		$parts = wp_parse_url( $liberated_posts[0]->guid );
		if ( ! is_array( $parts ) || empty( $parts['scheme'] ) || empty( $parts['host'] ) ) {
			return '';
		}

		$origin = $parts['scheme'] . '://' . $parts['host'];

		// Keep an explicit port: without it the origin would point at a different server.
		if ( ! empty( $parts['port'] ) ) {
			$origin .= ':' . $parts['port'];
		}

		return $origin;
	}
}
Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
<?php
2+
3+
namespace DotOrg\TryWordPress;
4+
5+
use WP_Error;
6+
use WP_REST_Controller;
7+
use WP_REST_Response;
8+
use WP_REST_Server;
9+
10+
class Crawler_Controller extends WP_REST_Controller {

	/**
	 * Domain is inferred from liberated_data guid
	 *
	 * @var string $domain Domain/Host being liberated
	 */
	private string $domain = '';

	/**
	 * Post type under which crawler queue entries are stored.
	 *
	 * @var string $crawler_queue_post_type
	 */
	private string $crawler_queue_post_type;

	/**
	 * Stores the configuration and hooks route registration into `rest_api_init`.
	 *
	 * @param string $domain                  Seed URL returned while the queue is still empty.
	 * @param string $crawler_queue_post_type Post type under which queue entries are stored.
	 */
	public function __construct( string $domain, string $crawler_queue_post_type ) {
		$this->domain                  = $domain;
		$this->crawler_queue_post_type = $crawler_queue_post_type;

		add_action( 'rest_api_init', array( $this, 'register_routes' ) );
	}

	/**
	 * Registers `/crawler/next` and `/crawler/queue` under the `try-wp/v1` namespace.
	 */
	public function register_routes(): void {
		$version   = '1';
		$namespace = 'try-wp/v' . $version;

		register_rest_route(
			$namespace,
			'/crawler/next',
			array(
				array(
					'methods'             => WP_REST_Server::READABLE,
					'callback'            => array( $this, 'get_next_url' ),
					'permission_callback' => '__return_true',
					'args'                => array(
						'context' => array(
							'default' => 'view',
						),
					),
				),
			)
		);

		register_rest_route(
			$namespace,
			'/crawler/queue',
			array(
				array(
					// queue_urls() reads a JSON request body and writes posts, so POST
					// must be accepted; GET is kept so existing callers keep working.
					'methods'             => array( WP_REST_Server::READABLE, WP_REST_Server::CREATABLE ),
					'callback'            => array( $this, 'queue_urls' ),
					'permission_callback' => '__return_true',
					// @TODO Specify args here so that sanitization is handled automatically
					'args'                => array(
						'context' => array(
							'default' => 'view',
						),
					),
				),
			)
		);
	}

	/**
	 * Returns the next URL to crawl.
	 *
	 * Responds with the guid of the oldest `discovered` queue entry. When there
	 * is none, responds with the configured domain if nothing has been crawled
	 * yet, or with an empty 204 response otherwise.
	 *
	 * @param mixed $request Incoming REST request (not read by this handler).
	 *
	 * @return WP_REST_Response|WP_Error
	 */
	public function get_next_url( $request ): WP_REST_Response|WP_Error {
		$ready_to_crawl_urls = $this->get_oldest_queue_posts( 'discovered' );

		if ( ! empty( $ready_to_crawl_urls ) ) {
			return new WP_REST_Response( $ready_to_crawl_urls[0]->guid );
		}

		// have we finished crawling or haven't even started yet?
		$crawled_urls = $this->get_oldest_queue_posts( 'crawled' );

		if ( empty( $crawled_urls ) ) {
			// we haven't begun, so return domain itself
			return new WP_REST_Response( $this->domain );
		}

		return new WP_REST_Response( null, 204 );
	}

	/**
	 * Marks `sourceUrl` as crawled and queues every URL listed in `urls`.
	 *
	 * Expects a JSON body of the form `{ "sourceUrl": "...", "urls": [ "...", ... ] }`.
	 *
	 * @param mixed $request Incoming REST request; only its raw body is read.
	 *
	 * @return WP_REST_Response|WP_Error 400 when `sourceUrl` is missing or unusable,
	 *                                   404 when it is neither queued nor the seed URL,
	 *                                   a WP_Error when a post could not be written,
	 *                                   an empty 200 response otherwise.
	 */
	public function queue_urls( $request ): WP_REST_Response|WP_Error {
		$request_data = json_decode( $request->get_body(), true );

		if ( ! is_array( $request_data ) || empty( $request_data['sourceUrl'] ) || ! is_string( $request_data['sourceUrl'] ) ) {
			return new WP_REST_Response( null, 400 );
		}

		// Sanitize once and use the same value for the lookup and the update, so both
		// steps always address the same queue entry.
		$source_url = sanitize_url( $request_data['sourceUrl'] );
		if ( '' === $source_url ) {
			return new WP_REST_Response( null, 400 );
		}

		$post_id = $this->get_post_id_by_guid( $source_url );
		if ( empty( $post_id ) ) {
			// get_next_url() hands out the seed URL without queueing it, so its entry
			// is created here the first time the crawler reports back on it.
			if ( ! $this->is_seed_url( $source_url ) ) {
				return new WP_REST_Response( null, 404 );
			}

			$seeded = $this->queue_url( $source_url );
			if ( is_wp_error( $seeded ) ) {
				return $seeded;
			}
		}

		$marked = $this->mark_url_as_crawled( $source_url );
		if ( is_wp_error( $marked ) ) {
			return $marked;
		}

		// `urls` is optional; a missing or malformed list means there is nothing to queue.
		$urls = ( isset( $request_data['urls'] ) && is_array( $request_data['urls'] ) ) ? $request_data['urls'] : array();

		foreach ( $urls as $url ) {
			if ( ! is_string( $url ) ) {
				continue;
			}

			$queued_result = $this->queue_url( $url );
			if ( is_wp_error( $queued_result ) ) {
				return $queued_result;
			}
		}

		return new WP_REST_Response();
	}

	/**
	 * Fetches at most one queue post with the given status, oldest first.
	 *
	 * @param string $post_status Post status to look for.
	 *
	 * @return array Result of get_posts(); empty when no entry matches.
	 */
	private function get_oldest_queue_posts( string $post_status ): array {
		return get_posts(
			array(
				'post_type'      => $this->crawler_queue_post_type,
				'posts_per_page' => 1,
				'post_status'    => $post_status,
				'orderby'        => 'date',
				'order'          => 'ASC',
			)
		);
	}

	/**
	 * Tells whether a URL is the seed URL, ignoring a trailing slash.
	 *
	 * @param string $url Sanitized URL to compare.
	 *
	 * @return bool False when no domain is configured.
	 */
	private function is_seed_url( string $url ): bool {
		if ( '' === $this->domain ) {
			return false;
		}

		return untrailingslashit( $url ) === untrailingslashit( sanitize_url( $this->domain ) );
	}

	/**
	 * Adds a URL to the crawler queue unless an entry with that guid already exists.
	 *
	 * @param string $url URL to queue.
	 *
	 * @return true|WP_Error True when the URL is queued, was already queued, or was
	 *                       skipped because nothing remained after sanitization;
	 *                       the WP_Error from wp_insert_post() on failure.
	 */
	private function queue_url( string $url ): true|WP_Error {
		$url = sanitize_url( $url );
		if ( '' === $url ) {
			// Nothing usable is left; inserting would make WordPress generate an unrelated guid.
			return true;
		}

		// insert only if it's not present
		if ( ! empty( $this->get_post_id_by_guid( $url ) ) ) {
			return true;
		}

		$inserted_post_id = wp_insert_post(
			array(
				'post_type'   => $this->crawler_queue_post_type,
				// get_next_url() only picks up entries in this status; without it the
				// post would get the default `draft` status and never be crawled.
				'post_status' => 'discovered',
				// wp_insert_post() rejects posts whose title, content and excerpt are all
				// empty when the post type supports those fields.
				'post_title'  => $url,
				'guid'        => $url,
			),
			true
		);

		if ( is_wp_error( $inserted_post_id ) ) {
			return $inserted_post_id;
		}

		return true;
	}

	/**
	 * Sets the status of the queue entry identified by its guid to `crawled`.
	 *
	 * @param string $url Guid of the queue entry.
	 *
	 * @return true|WP_Error True on success; a `rest_save_failed` error when the
	 *                       entry does not exist or could not be updated.
	 */
	private function mark_url_as_crawled( $url ): true|WP_Error {
		$post_id = $this->get_post_id_by_guid( (string) $url );

		// Never call get_post() without an ID: it would fall back to the global post.
		$post = empty( $post_id ) ? null : get_post( $post_id );

		if ( $post ) {
			$post->post_status = 'crawled';
			if ( wp_update_post( $post ) === $post->ID ) {
				return true;
			}
		}

		return new WP_Error(
			'rest_save_failed',
			__( 'Failed to update url as crawled', 'try_wordpress' ),
			array( 'status' => 500 )
		);
	}

	/**
	 * Looks up the ID of the crawler queue entry that has the given guid.
	 *
	 * Results are kept in the object cache; a cached ID whose post no longer
	 * exists is discarded and looked up again.
	 *
	 * @param string $guid Guid to search for.
	 *
	 * @return int|null Post ID, or null when no queue entry has that guid.
	 */
	public function get_post_id_by_guid( string $guid ): ?int {
		// Use wp_cache_* for guid -> postId
		$cache_group = 'try_wp';
		// The post type is part of the key because the lookup is scoped to it.
		$cache_key = 'try_wp_crawler_cache_guid_' . md5( $this->crawler_queue_post_type . '|' . $guid );
		$post_id   = wp_cache_get( $cache_key, $cache_group );

		if ( false !== $post_id ) {
			// Cache hit - get post using WordPress API
			$post = get_post( $post_id );
			if ( $post ) {
				return (int) $post_id;
			}
			// If post not found despite cache hit, delete the cache
			wp_cache_delete( $cache_key, $cache_group );
		}

		// Cache miss - query database
		global $wpdb;
		// Restrict the match to queue entries: posts of other types may carry the
		// same guid and must neither block queueing nor be marked as crawled.
		// phpcs:ignore WordPress.DB.DirectDatabaseQuery.DirectQuery
		$post_id = $wpdb->get_var(
			$wpdb->prepare(
				"SELECT ID FROM $wpdb->posts WHERE guid = %s AND post_type = %s LIMIT 1",
				$guid,
				$this->crawler_queue_post_type
			)
		);

		if ( $post_id ) {
			// Cache the post ID for future lookups
			wp_cache_set( $cache_key, $post_id, $cache_group, YEAR_IN_SECONDS );
			return (int) $post_id;
		}

		return null;
	}
}

src/plugin/class-engine.php

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
class Engine {
66

77
public const string LIBERATED_DATA_POST_TYPE = 'liberated_data';
8-
public const string CRAWLER_DATA_POST_TYPE = 'dl_crawler_url';
8+
public const string CRAWLER_QUEUE_POST_TYPE = 'dl_crawler_url';
99

1010
public function __construct() {
1111
require 'enum-subject-type.php';
@@ -15,6 +15,7 @@ public function __construct() {
1515
require 'class-liberate-controller.php';
1616
require 'class-blogpost-controller.php';
1717
require 'class-page-controller.php';
18+
require 'class-crawler-controller.php';
1819
require 'class-controller-registry.php';
1920
require 'class-storage.php';
2021
require 'class-subject.php';
@@ -23,11 +24,11 @@ public function __construct() {
2324
( function () {
2425
$transformer = new Transformer();
2526

26-
new Post_Type_UI( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_DATA_POST_TYPE, $transformer );
27+
new Post_Type_UI( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_QUEUE_POST_TYPE, $transformer );
2728

28-
new Controller_Registry( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_DATA_POST_TYPE );
29+
new Controller_Registry( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_QUEUE_POST_TYPE );
2930

30-
new Storage( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_DATA_POST_TYPE );
31+
new Storage( self::LIBERATED_DATA_POST_TYPE, self::CRAWLER_QUEUE_POST_TYPE );
3132

3233
Subject_Repo::init( self::LIBERATED_DATA_POST_TYPE );
3334
} )();

tests/plugin/base-test.php

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,13 @@
22
/**
33
* Setup for running tests would come here.
44
*/
5+
6+
// Fixture for the crawler controller: a draft liberated_data post with a guid.
// The title is set through `post_title`, the field name wp_insert_post() reads,
// so the post is not rejected as empty.
wp_insert_post(
	array(
		'post_type'   => \DotOrg\TryWordPress\Engine::LIBERATED_DATA_POST_TYPE,
		'post_title'  => 'something to avoid empty filter',
		'guid'        => 'https://example.org/1',
		'post_status' => 'draft',
	)
);
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
<?php
2+
3+
use DotOrg\TryWordPress\Crawler_Controller;
4+
use PHPUnit\Framework\TestCase;
5+
6+
/**
 * Tests for the crawler REST endpoints registered by Crawler_Controller.
 */
class Crawler_Controller_Test extends TestCase {
	private Crawler_Controller $crawler_controller;

	private string $domain    = 'https://example.org';
	private string $namespace = 'try-wp/v1';
	private string $endpoint;
	private string $crawler_queue_post_type = 'dl_crawl';

	protected function setUp(): void {
		parent::setUp();

		$this->endpoint = '/' . $this->namespace . '/crawler';

		// Note: `base-test.php` sets a `liberated_data` post

		$this->crawler_controller = new Crawler_Controller(
			$this->domain,
			$this->crawler_queue_post_type
		);
	}

	/**
	 * Both crawler routes must be present on the REST server.
	 */
	public function testRegisterRoutes(): void {
		// do_action( 'rest_api_init' ); // so that register_route() executes.

		$routes = rest_get_server()->get_routes( $this->namespace );
		$this->assertArrayHasKey( $this->endpoint . '/next', $routes );
		$this->assertArrayHasKey( $this->endpoint . '/queue', $routes );
	}

	/**
	 * @group failing
	 */
	public function testGetNextUrlWithoutQueue(): void {
		// first fetch should return the domain itself since that's the first url to crawl
		$request  = new WP_REST_Request( 'GET', $this->endpoint . '/next' );
		$response = rest_do_request( $request );

		$this->assertEquals( 200, $response->get_status() );
		$this->assertEquals( $this->domain, $response->get_data() );
	}

	/**
	 * A queue request without a body carries no `sourceUrl` and must be rejected.
	 */
	public function testQueueUrls(): void {
		$request  = new WP_REST_Request( 'GET', $this->endpoint . '/queue' );
		$response = rest_do_request( $request );

		$this->assertEquals( 400, $response->get_status() );
	}

	public function testGetNextUrlFromQueue(): void {
		// TODO: seed the queue with a discovered URL and assert that it is returned.
		// Until then this repeats the empty-queue expectation: the domain comes back.
		$request  = new WP_REST_Request( 'GET', $this->endpoint . '/next' );
		$response = rest_do_request( $request );

		$this->assertEquals( 200, $response->get_status() );
		$this->assertEquals( $this->domain, $response->get_data() );
	}
}

0 commit comments

Comments
 (0)