9/14/2023, 7:46:49 PM
List of available calls
| Method | Route | Description | Notice |
|---|---|---|---|
| GET | /api/robot/blog/updates/{epoch_millis} | Brief information about articles created or updated since {epoch_millis} | size <= 1000 |
| GET | /api/blog/{id} | Article with {id} | |
| GET | /api/blog/attachment?id={id} | Article attachment with {id} | |
| GET | /api/blog/all/{from}/{size} | Articles starting {from} with page {size} | size <= 9 |
| GET | /api/blog/brief/{from}/{size} | Brief information about articles starting {from} with page {size} | size <= 1000 |
Usage
Fetch updates since the beginning and synchronize each article with your database.
After that any subsequent updates call should supply the most recent `updatedDt` from previously synchronized articles + 1 millisecond.
Migration
We have introduced the `updatedDt` field on the article; combine it with the new updates call to make your crawler updates-aware.
As a temporary quick fix you can simply replace the route `/api/blog/all-brief` with `/api/blog/brief/0/1000`.
Also notice that we have limited page size of `/api/blog/all` call to 9 articles.
Example
Download simple crawler implementation written in Python.
import datetime
import http
import json
import logging
import time
import urllib.parse
from http import client
from typing import *
def _get_updated_dt_of(partial_article: dict) -> int:
"""
Helper function to retrieve article's `updatedDt`.
We need to handle the case when `updatedDt` is None and use `createdDt` instead.
"""
for dt_field in ["updatedDt", "createdDt"]:
dt = partial_article.get(dt_field)
if dt is not None:
return dt
return 0
class Database:
    """
    In-memory store of articles and their attachments, each keyed by id.
    """

    def __init__(self):
        # article id -> article dict (as returned by the API)
        self.articles = dict()
        # attachment id -> raw attachment bytes
        self.attachments = dict()

    def get_latest_update_dt(self) -> int:
        """
        Return the most recent `updatedDt` across all stored articles
        plus 1 ms (suitable as the `since` argument of the updates call),
        or 0 when no stored article carries a timestamp.
        """
        latest = 0
        for article in self.articles.values():
            # Prefer updatedDt, fall back to createdDt, skip if neither set.
            for field in ("updatedDt", "createdDt"):
                value = article.get(field)
                if value is not None:
                    latest = max(latest, value)
                    break
        return latest + 1 if latest > 0 else 0
class Api:
    """
    Thin HTTP client for the blog API endpoint.
    """

    def __init__(self, host: str, port: int, base_route: str):
        self.host = host
        self.port = port
        # Prefix prepended to every route, e.g. "/blog/api".
        self.base_route = base_route

    def get_updates(self, since_epoch_millis: int) -> Optional[dict]:
        """
        Fetch brief info about articles created or updated since the given
        epoch-milliseconds timestamp. Returns None when the request fails.
        """
        return self._call(f"/robot/blog/updates/{since_epoch_millis}")

    def get_article(self, article_id: str) -> Optional[dict]:
        """
        Fetch a full article by id. Returns None when the request fails.
        """
        return self._call(f"/blog/{article_id}")

    def get_attachment(self, attachment_id: str) -> Optional[bytes]:
        """
        Fetch the compressed variant of an attachment by id.
        Returns the raw bytes, or None when the request fails.
        """
        # The "/compressed" suffix selects the compressed variant; the whole
        # value is percent-encoded because it travels as a query parameter.
        compressed_attachment_id = urllib.parse.quote_plus(f"{attachment_id}/compressed")
        return self._call(
            f"/blog/attachment?id={compressed_attachment_id}",
            output_is_json=False,
        )

    def _call(self, route: str, output_is_json: bool = True) -> Optional[Any]:
        """
        Perform a GET request against `base_route` + `route`.

        Returns the parsed JSON body (or raw bytes when `output_is_json`
        is False) on HTTP 200; logs the failure and returns None for any
        other status. The connection is always closed, even on error.
        """
        # BUGFIX: previously f"{base_route}/{route}" produced a double slash
        # ("/blog/api//robot/...") because every route already starts with "/".
        if route.startswith("/"):
            url = f"{self.base_route}{route}"
        else:
            url = f"{self.base_route}/{route}"
        conn = client.HTTPConnection(self.host, self.port)
        try:
            conn.request(method="GET", url=url)
            response = conn.getresponse()
            if response.status == http.HTTPStatus.OK:
                body = response.read()
                # BUGFIX: return type was annotated `-> any` (the builtin
                # function), corrected to Optional[Any].
                return json.loads(body.decode()) if output_is_json else body
            logging.error(f"Server responded with HTTP {response.status}")
            logging.error(response.read().decode())
        finally:
            conn.close()
        return None
class Crawler:
    """
    Drives synchronization of articles and attachments from the API
    into the local database.
    """

    def __init__(self, db: Database, api: Api):
        self.db = db
        self.api = api

    def sync(self):
        """
        Request every update newer than the locally stored articles and
        re-download each article that is new or whose timestamp changed.
        """
        since = self.db.get_latest_update_dt()
        since_utc = datetime.datetime.fromtimestamp(since / 1000, tz=datetime.timezone.utc)
        logging.info(f"Requesting updates since {since_utc}")
        updates = self.api.get_updates(since)
        if not updates:
            return
        logging.info(f"Got {len(updates)} updates")
        for brief in updates:
            article_id = brief['id']
            stored = self.db.articles.get(article_id)
            # Unknown article, or its last-modification timestamp differs.
            if not stored or _get_updated_dt_of(stored) != _get_updated_dt_of(brief):
                self.sync_article(article_id)

    def sync_article(self, article_id: str):
        """
        Download one article, store it, and pull each of its attachments
        (publication logo + attachment list) that is not yet stored.
        """
        logging.info(f"Downloading article {article_id}")
        article = self.api.get_article(article_id)
        if article is None:
            return
        self.db.articles[article_id] = article
        publication = article["publication"]
        for attachment in [publication["logo"], *publication["attachments"]]:
            # Entries may be None (e.g. no logo set) — skip those.
            if attachment is None:
                continue
            attachment_id = attachment['id']
            if attachment_id not in self.db.attachments:
                self.sync_attachment(attachment_id)

    def sync_attachment(self, attachment_id: str):
        """
        Download one attachment and store its raw bytes.
        """
        logging.info(f"Downloading attachment {attachment_id}")
        data = self.api.get_attachment(attachment_id)
        if data is not None:
            self.db.attachments[attachment_id] = data
def main():
    """
    Configure logging, then run the synchronization loop forever,
    sleeping 30 minutes between passes.
    """
    logging.basicConfig(level=logging.INFO)
    database = Database()
    api = Api(host="127.0.0.1", port=8080, base_route="/blog/api")
    crawler = Crawler(db=database, api=api)
    while True:
        crawler.sync()
        # One synchronization pass every 30 minutes.
        time.sleep(30 * 60)


if __name__ == '__main__':
    main()
