Skip to content

Commit 4cb9880

Browse files
authored
add scopus csv (#38)
* Start model validating * Draft scopus csv parser * Add functional test * Split on ; in the end * Add verbose option to the cli
1 parent 21d1b71 commit 4cb9880

File tree

5 files changed

+767
-3
lines changed

5 files changed

+767
-3
lines changed

docs/examples/scopus.csv

Lines changed: 582 additions & 0 deletions
Large diffs are not rendered by default.

src/bibx/__init__.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from bibx.article import Article
88
from bibx.builders.openalex import EnrichReferences, OpenAlexCollectionBuilder
99
from bibx.builders.scopus_bib import ScopusBibCollectionBuilder
10+
from bibx.builders.scopus_csv import ScopusCsvCollectionBuilder
1011
from bibx.builders.scopus_ris import ScopusRisCollectionBuilder
1112
from bibx.builders.wos import WosCollectionBuilder
1213
from bibx.collection import Collection
@@ -56,6 +57,15 @@ def read_scopus_ris(*files: TextIO) -> Collection:
5657
return ScopusRisCollectionBuilder(*files).build()
5758

5859

60+
def read_scopus_csv(*files: TextIO) -> Collection:
    """Build a collection from any number of Scopus CSV files.

    :param files: open Scopus CSV files.
    :return: the collection built from all the given files.
    """
    builder = ScopusCsvCollectionBuilder(*files)
    return builder.build()
67+
68+
5969
def read_wos(*files: TextIO) -> Collection:
6070
"""Take any number of wos text files and returns a collection.
6171
@@ -67,7 +77,7 @@ def read_wos(*files: TextIO) -> Collection:
6777

6878
def read_any(file: TextIO) -> Collection:
6979
"""Try to read a file with the supported formats."""
70-
for handler in (read_wos, read_scopus_ris, read_scopus_bib):
80+
for handler in (read_wos, read_scopus_ris, read_scopus_bib, read_scopus_csv):
7181
try:
7282
return handler(file)
7383
except BibXError as e:

src/bibx/builders/scopus_csv.py

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
"""CSV based builder for Scopus data."""
2+
3+
import csv
4+
import logging
5+
from collections.abc import Generator
6+
from typing import Annotated, Optional, TextIO
7+
8+
from pydantic import BaseModel, Field
9+
from pydantic.functional_validators import BeforeValidator
10+
11+
from bibx.article import Article
12+
from bibx.collection import Collection
13+
14+
from .base import CollectionBuilder
15+
16+
logger = logging.getLogger(__name__)
17+
18+
19+
def _str_or_none(value: Optional[str]) -> Optional[str]:
20+
return value if value else None
21+
22+
23+
def _split_str(value: Optional[str]) -> list[str]:
24+
return value.strip().split("; ") if value else []
25+
26+
27+
class Row(BaseModel):
    """Row model for Scopus CSV data.

    Each field is read from the CSV column named by its
    ``validation_alias``. List fields are built from a single cell by
    ``_split_str``; optional fields turn an empty cell into ``None`` through
    ``_str_or_none`` before the declared type is validated.
    """

    # Author names, split from one cell into a list.
    # NOTE(review): unlike every other alias here, this one contains literal
    # double quotes -- presumably it mirrors how the first header cell of the
    # export reaches the model; confirm against a real file.
    authors: Annotated[
        list[str],
        Field(validation_alias='"Authors"'),
        BeforeValidator(_split_str),
    ]
    # Required columns: no pre-validator, so the raw cell is validated as-is.
    year: Annotated[int, Field(validation_alias="Year")]
    title: Annotated[str, Field(validation_alias="Title")]
    journal: Annotated[str, Field(validation_alias="Abbreviated Source Title")]
    # Optional text columns: an empty cell becomes None.
    volume: Annotated[
        Optional[str],
        Field(validation_alias="Volume"),
        BeforeValidator(_str_or_none),
    ]
    issue: Annotated[
        Optional[str],
        Field(validation_alias="Issue"),
        BeforeValidator(_str_or_none),
    ]
    # Only the "Page start" column is read for the page.
    page: Annotated[
        Optional[str],
        Field(validation_alias="Page start"),
        BeforeValidator(_str_or_none),
    ]
    doi: Annotated[
        Optional[str],
        Field(validation_alias="DOI"),
        BeforeValidator(_str_or_none),
    ]
    # Optional integer: an empty cell becomes None before int validation.
    cited_by: Annotated[
        Optional[int],
        Field(validation_alias="Cited by"),
        BeforeValidator(_str_or_none),
    ]
    # List columns, each split from a single cell.
    references: Annotated[
        list[str],
        Field(validation_alias="References"),
        BeforeValidator(_split_str),
    ]
    author_keywords: Annotated[
        list[str],
        Field(validation_alias="Author Keywords"),
        BeforeValidator(_split_str),
    ]
    index_keywords: Annotated[
        list[str],
        Field(validation_alias="Index Keywords"),
        BeforeValidator(_split_str),
    ]
    # Value of the "Source" column, required.
    source: Annotated[str, Field(validation_alias="Source")]
79+
80+
81+
class ScopusCsvCollectionBuilder(CollectionBuilder):
    """Builder for Scopus data from CSV files."""

    def __init__(self, *files: TextIO) -> None:
        """Store the files and rewind each of them.

        :param files: open Scopus CSV files.
        """
        self._files = files
        # Rewind so a file that was already (partly) read is parsed from
        # its beginning.
        for file in self._files:
            file.seek(0)

    def build(self) -> Collection:
        """Build the collection.

        :return: a collection holding the deduplicated articles parsed from
            every file.
        """
        articles = self._articles_from_files()
        return Collection(articles=Collection.deduplicate_articles(list(articles)))

    def _articles_from_files(self) -> Generator[Article]:
        """Yield the articles of every file, in the order files were given."""
        for file in self._files:
            yield from self._parse_file(file)

    def _parse_file(self, file: TextIO) -> Generator[Article]:
        """Yield one article per CSV row of ``file``.

        Rows are keyed by the header line and validated with ``Row``; a row
        that does not validate makes ``Row.model_validate`` raise.
        """
        reader = csv.DictReader(file)
        for row in reader:
            yield self._article_from_row(Row.model_validate(row))

    def _article_from_row(self, datum: Row) -> Article:
        """Convert a validated row into an article with id and label set."""
        # Keep every reference that could be parsed; only ``None`` marks a
        # parse failure, so test for it explicitly instead of truthiness.
        references = [
            article
            for article in map(self._article_from_reference, datum.references)
            if article is not None
        ]
        # ``dict.fromkeys`` removes duplicates while keeping first-seen
        # order; going through ``set`` made the order vary between runs.
        keywords = list(dict.fromkeys(datum.author_keywords + datum.index_keywords))
        return (
            Article(
                label="",
                ids=set(),
                title=datum.title,
                authors=datum.authors,
                year=datum.year,
                journal=datum.journal,
                volume=datum.volume,
                issue=datum.issue,
                page=datum.page,
                doi=datum.doi,
                times_cited=datum.cited_by,
                references=references,
                keywords=keywords,
                sources={datum.source},
            )
            .add_simple_id()
            .set_simple_label()
        )

    def _article_from_reference(self, reference: str) -> Optional[Article]:
        """Parse a reference string into an article, if possible.

        The reference is split on ``", "``: the last three parts are taken
        as journal, issue and year (with surrounding parentheses removed),
        and everything before them as the authors.

        :param reference: one entry of the references cell.
        :return: the article, or ``None`` when the reference cannot be
            parsed (the failure is logged at debug level).
        """
        try:
            *authors, journal, issue, year = reference.split(", ")
            _year = int(year.lstrip("(").rstrip(")"))
            return Article(
                label=reference,
                ids={reference},
                authors=authors,
                year=_year,
                journal=journal,
                issue=issue,
            ).add_simple_id()
        except ValueError:
            # Raised when there are fewer than three parts or the year is
            # not an integer.
            logger.debug("error parsing reference: %s", reference)
            return None

src/bibx/cli.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import logging
22
from enum import Enum
3+
from typing import Annotated
34

45
import networkx as nx
56
import typer
@@ -9,6 +10,7 @@
910
query_openalex,
1011
read_any,
1112
read_scopus_bib,
13+
read_scopus_csv,
1214
read_scopus_ris,
1315
read_wos,
1416
)
@@ -18,12 +20,24 @@
1820
app = typer.Typer()
1921

2022

23+
@app.callback()
def main(
    verbose: Annotated[  # noqa: FBT002
        bool, typer.Option("--verbose", "-v", help="Enable verbose logging.")
    ] = False,
) -> None:
    """BibX is a command-line tool for parsing bibliographic data."""
    # The docstring is kept verbatim: presumably Typer surfaces it as the
    # application's help text -- confirm before rewording it.
    if not verbose:
        # Leave the logging configuration untouched by default.
        return
    # With --verbose / -v, emit debug-level records.
    logging.basicConfig(level=logging.DEBUG)
32+
33+
2134
class Format(Enum):
    """Supported formats.

    The ``describe`` command compares its ``format`` argument against these
    members to choose which reader to call.
    """

    # Presumably handled with ``read_wos`` -- that branch is not shown here.
    WOS = "wos"
    # Scopus RIS export, read with ``read_scopus_ris``.
    RIS = "ris"
    # Scopus BibTeX export, read with ``read_scopus_bib``.
    BIB = "bib"
    # Scopus CSV export, read with ``read_scopus_csv``.
    CSV = "csv"
2741

2842

2943
@app.command()
@@ -37,12 +51,17 @@ def describe(format: Format, filename: str) -> None:
3751
if format == Format.RIS:
3852
with open(filename) as f:
3953
c = read_scopus_ris(f)
40-
rprint(":boom: the file satisfies the ISI WOS format")
54+
rprint(":boom: the file satisfies the scopus RIS format")
4155
rprint(f"There are {len(c.articles)} records parsed")
4256
if format == Format.BIB:
4357
with open(filename) as f:
4458
c = read_scopus_bib(f)
45-
rprint(":boom: the file satisfies the ISI WOS format")
59+
rprint(":boom: the file satisfies the scopus BIB format")
60+
rprint(f"There are {len(c.articles)} records parsed")
61+
if format == Format.CSV:
62+
with open(filename) as f:
63+
c = read_scopus_csv(f)
64+
rprint(":boom: the file satisfies the scopus CSV format")
4665
rprint(f"There are {len(c.articles)} records parsed")
4766

4867

tests/builders/test_scopus_csv.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from bibx import read_scopus_csv
2+
3+
4+
def test_scopus_csv() -> None:
    """Check that the example Scopus CSV file is parsed into articles."""
    with open("docs/examples/scopus.csv") as file:
        collection = read_scopus_csv(file)
    assert collection is not None
    # ``read_scopus_csv`` returns a collection whenever parsing succeeds, so
    # also require that articles were actually read from the example file.
    assert len(collection.articles) > 0

0 commit comments

Comments
 (0)