Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
582 changes: 582 additions & 0 deletions docs/examples/scopus.csv

Large diffs are not rendered by default.

12 changes: 11 additions & 1 deletion src/bibx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from bibx.article import Article
from bibx.builders.openalex import EnrichReferences, OpenAlexCollectionBuilder
from bibx.builders.scopus_bib import ScopusBibCollectionBuilder
from bibx.builders.scopus_csv import ScopusCsvCollectionBuilder
from bibx.builders.scopus_ris import ScopusRisCollectionBuilder
from bibx.builders.wos import WosCollectionBuilder
from bibx.collection import Collection
Expand Down Expand Up @@ -56,6 +57,15 @@ def read_scopus_ris(*files: TextIO) -> Collection:
return ScopusRisCollectionBuilder(*files).build()


def read_scopus_csv(*files: TextIO) -> Collection:
    """Build a collection from any number of Scopus CSV files.

    :param files: open Scopus CSV files.
    :return: the collection
    """
    builder = ScopusCsvCollectionBuilder(*files)
    return builder.build()


def read_wos(*files: TextIO) -> Collection:
"""Take any number of wos text files and returns a collection.
Expand All @@ -67,7 +77,7 @@ def read_wos(*files: TextIO) -> Collection:

def read_any(file: TextIO) -> Collection:
"""Try to read a file with the supported formats."""
for handler in (read_wos, read_scopus_ris, read_scopus_bib):
for handler in (read_wos, read_scopus_ris, read_scopus_bib, read_scopus_csv):
try:
return handler(file)
except BibXError as e:
Expand Down
145 changes: 145 additions & 0 deletions src/bibx/builders/scopus_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
"""CSV based builder for Scopus data."""

import csv
import logging
from collections.abc import Generator
from typing import Annotated, Optional, TextIO

from pydantic import AliasChoices, BaseModel, Field
from pydantic.functional_validators import BeforeValidator

from bibx.article import Article
from bibx.collection import Collection

from .base import CollectionBuilder

logger = logging.getLogger(__name__)


def _str_or_none(value: Optional[str]) -> Optional[str]:
return value if value else None


def _split_str(value: Optional[str]) -> list[str]:
return value.strip().split("; ") if value else []


class Row(BaseModel):
    """Row model for Scopus CSV data.

    Each field is read from the CSV column named by its validation alias.
    List-valued columns are split with ``_split_str`` and optional columns
    are normalised with ``_str_or_none`` before type validation.
    """

    # The first header cell can reach us under several spellings: plain
    # ``Authors``, with its quotes kept (``"Authors"``), or additionally
    # prefixed by a byte-order mark when the file starts with one and is read
    # without stripping it. Accept all of them.
    authors: Annotated[
        list[str],
        Field(
            validation_alias=AliasChoices(
                "Authors",
                '"Authors"',
                '\ufeff"Authors"',
            )
        ),
        BeforeValidator(_split_str),
    ]
    year: Annotated[int, Field(validation_alias="Year")]
    title: Annotated[str, Field(validation_alias="Title")]
    journal: Annotated[str, Field(validation_alias="Abbreviated Source Title")]
    volume: Annotated[
        Optional[str],
        Field(validation_alias="Volume"),
        BeforeValidator(_str_or_none),
    ]
    issue: Annotated[
        Optional[str],
        Field(validation_alias="Issue"),
        BeforeValidator(_str_or_none),
    ]
    # Only the start page is read from the export.
    page: Annotated[
        Optional[str],
        Field(validation_alias="Page start"),
        BeforeValidator(_str_or_none),
    ]
    doi: Annotated[
        Optional[str],
        Field(validation_alias="DOI"),
        BeforeValidator(_str_or_none),
    ]
    # Blank cells become None before the int conversion is attempted.
    cited_by: Annotated[
        Optional[int],
        Field(validation_alias="Cited by"),
        BeforeValidator(_str_or_none),
    ]
    references: Annotated[
        list[str],
        Field(validation_alias="References"),
        BeforeValidator(_split_str),
    ]
    author_keywords: Annotated[
        list[str],
        Field(validation_alias="Author Keywords"),
        BeforeValidator(_split_str),
    ]
    index_keywords: Annotated[
        list[str],
        Field(validation_alias="Index Keywords"),
        BeforeValidator(_split_str),
    ]
    source: Annotated[str, Field(validation_alias="Source")]


class ScopusCsvCollectionBuilder(CollectionBuilder):
    """Builder for Scopus data from CSV files.

    Every file is read with ``csv.DictReader``, each row is validated against
    ``Row`` and converted into an ``Article``, and the articles of all files
    are deduplicated into a single ``Collection``.
    """

    def __init__(self, *files: TextIO) -> None:
        """Store the open files and rewind each of them.

        :param files: open Scopus CSV files.
        """
        self._files = files
        # Rewind so that a handle which was already read is parsed from the start.
        for file in self._files:
            file.seek(0)

    def build(self) -> Collection:
        """Build the collection.

        :return: a collection holding the deduplicated articles of all files.
        """
        articles = self._articles_from_files()
        return Collection(articles=Collection.deduplicate_articles(list(articles)))

    def _articles_from_files(self) -> Generator[Article]:
        """Yield the articles of every file, in the order the files were given."""
        for file in self._files:
            yield from self._parse_file(file)

    def _parse_file(self, file: TextIO) -> Generator[Article]:
        """Yield one article per data row of ``file``.

        A row that does not satisfy ``Row`` makes ``Row.model_validate`` raise
        pydantic's validation error, which is not handled here.
        """
        for row in csv.DictReader(file):
            datum = Row.model_validate(row)
            parsed = (self._article_from_reference(ref) for ref in datum.references)
            # Unparseable references come back as None and are left out.
            references = [ref for ref in parsed if ref is not None]
            # dict.fromkeys removes duplicates while keeping first-seen order,
            # so the keyword list is the same on every run.
            keywords = list(
                dict.fromkeys(datum.author_keywords + datum.index_keywords)
            )
            yield (
                Article(
                    label="",
                    ids=set(),
                    title=datum.title,
                    authors=datum.authors,
                    year=datum.year,
                    journal=datum.journal,
                    volume=datum.volume,
                    issue=datum.issue,
                    page=datum.page,
                    doi=datum.doi,
                    times_cited=datum.cited_by,
                    references=references,
                    keywords=keywords,
                    sources={datum.source},
                )
                .add_simple_id()
                .set_simple_label()
            )

    def _article_from_reference(self, reference: str) -> Optional[Article]:
        """Turn one reference string into an article, if it can be parsed.

        The reference is split on ``", "``; the last three parts are read as
        journal, issue and year (the year may be wrapped in parentheses) and
        everything before them as authors.

        :param reference: a single entry of the references column.
        :return: the article, or ``None`` when a ``ValueError`` is raised, e.g.
            fewer than three parts or a non-numeric year.
        """
        try:
            *authors, journal, issue, year = reference.split(", ")
            _year = int(year.lstrip("(").rstrip(")"))
            return Article(
                label=reference,
                ids={reference},
                authors=authors,
                year=_year,
                journal=journal,
                issue=issue,
            ).add_simple_id()
        except ValueError:
            logger.debug("error parsing reference: %s", reference)
            return None
23 changes: 21 additions & 2 deletions src/bibx/cli.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
from enum import Enum
from typing import Annotated

import networkx as nx
import typer
Expand All @@ -9,6 +10,7 @@
query_openalex,
read_any,
read_scopus_bib,
read_scopus_csv,
read_scopus_ris,
read_wos,
)
Expand All @@ -18,12 +20,24 @@
app = typer.Typer()


@app.callback()
def main(
    verbose: Annotated[  # noqa: FBT002
        bool, typer.Option("--verbose", "-v", help="Enable verbose logging.")
    ] = False,
) -> None:
    """BibX is a command-line tool for parsing bibliographic data."""
    # Logging is left unconfigured unless the user asks for verbose output.
    if not verbose:
        return
    logging.basicConfig(level=logging.DEBUG)


class Format(Enum):
    """Supported formats.

    The values are the strings that name each input file format.
    """

    # NOTE(review): presumably Web of Science text exports; the handling
    # branch is not visible in this view, so confirm.
    WOS = "wos"
    # Scopus RIS export, read with read_scopus_ris.
    RIS = "ris"
    # Scopus BibTeX export, read with read_scopus_bib.
    BIB = "bib"
    # Scopus CSV export, read with read_scopus_csv.
    CSV = "csv"


@app.command()
Expand All @@ -37,12 +51,17 @@ def describe(format: Format, filename: str) -> None:
if format == Format.RIS:
with open(filename) as f:
c = read_scopus_ris(f)
rprint(":boom: the file satisfies the ISI WOS format")
rprint(":boom: the file satisfies the scopus RIS format")
rprint(f"There are {len(c.articles)} records parsed")
if format == Format.BIB:
with open(filename) as f:
c = read_scopus_bib(f)
rprint(":boom: the file satisfies the ISI WOS format")
rprint(":boom: the file satisfies the scopus BIB format")
rprint(f"There are {len(c.articles)} records parsed")
if format == Format.CSV:
with open(filename) as f:
c = read_scopus_csv(f)
rprint(":boom: the file satisfies the scopus CSV format")
rprint(f"There are {len(c.articles)} records parsed")


Expand Down
8 changes: 8 additions & 0 deletions tests/builders/test_scopus_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from bibx import read_scopus_csv


def test_scopus_csv() -> None:
    """Check that the bundled Scopus CSV example parses into articles."""
    with open("docs/examples/scopus.csv") as file:
        collection = read_scopus_csv(file)
    # The reader always returns a collection, so a None check proves nothing;
    # make sure rows were actually turned into articles.
    assert collection is not None
    assert len(collection.articles) > 0