Skip to content

Commit 56fe4f5

Browse files
committed
sipstore: create SIPs on record create/update
Signed-off-by: Pamfilos Fokianos <pamfilosf@gmail.com>
1 parent 4d9b3e9 commit 56fe4f5

17 files changed

+744
-40
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,3 +64,6 @@ target/
6464

6565
# Jetbrains editor project folder
6666
.idea/
67+
68+
# SIPStore
69+
archive/

cernopendata/config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,3 +435,6 @@
435435
**params
436436
)
437437
]
438+
439+
SIPSTORE_CHECKSUM_ALGORITHM = 'adler32'
440+

cernopendata/modules/fixtures/cli.py

Lines changed: 134 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,19 @@
3737
from invenio_pidstore.models import PersistentIdentifier
3838
from invenio_records_files.api import Record
3939
from invenio_records_files.models import RecordsBuckets
40+
from invenio_sipstore.models import SIPMetadataType
4041
from sqlalchemy.orm.attributes import flag_modified
4142

4243
from cernopendata.modules.records.minters.docid import \
4344
cernopendata_docid_minter
4445
from cernopendata.modules.records.minters.recid import \
4546
cernopendata_recid_minter
4647

48+
from .sip_utils import (
49+
handle_sipstore_record_file_index,
50+
handle_sipstore_record_file,
51+
sip_record,
52+
)
4753

4854
def get_jsons_from_dir(dir):
4955
"""Get JSON files inside a dir."""
@@ -55,8 +61,9 @@ def get_jsons_from_dir(dir):
5561
return res
5662

5763

58-
def handle_record_files(data, bucket, files, skip_files):
64+
def handle_record_files(data, bucket, files, skip_files, skip_sips):
5965
"""Handles record files."""
66+
sip_files = []
6067
for file in files:
6168
if skip_files:
6269
break
@@ -89,45 +96,80 @@ def handle_record_files(data, bucket, files, skip_files):
8996
str(e)))
9097
continue
9198

99+
if not skip_sips:
100+
if file.get("type", None) == "index.json":
101+
sip_files += handle_sipstore_record_file_index(f)
92102

93-
def create_record(schema, data, files, skip_files):
103+
return sip_files
104+
105+
106+
def handle_sip_files(files, skip_files, skip_sips):
107+
"""Handles record files."""
108+
sip_files = []
109+
for file in files:
110+
if skip_files:
111+
break
112+
assert 'uri' in file
113+
assert 'size' in file
114+
assert 'checksum' in file
115+
f = FileInstance.get_by_uri(file.get("uri"))
116+
117+
if f and not skip_sips:
118+
if file.get("type", None) == "index.json":
119+
sip_files += handle_sipstore_record_file_index(f)
120+
121+
return sip_files
122+
123+
124+
def create_record(schema, data, files, skip_files, skip_sips):
94125
"""Creates a new record."""
95126
id = uuid.uuid4()
96-
cernopendata_recid_minter(id, data)
127+
pid = cernopendata_recid_minter(id, data)
128+
97129
data['$schema'] = schema
98130
record = Record.create(data, id_=id)
99131
if not skip_files:
100132
bucket = Bucket.create()
101-
handle_record_files(data, bucket, files, skip_files)
133+
sip_files_content = handle_record_files(
134+
data, bucket, files, skip_files, skip_sips)
135+
102136
RecordsBuckets.create(
103137
record=record.model, bucket=bucket)
104138

105-
return record
139+
return pid, record, sip_files_content
106140

107141

108-
def update_record(pid, schema, data, files, skip_files):
142+
def update_record(pid, schema, data, files, skip_files, skip_sips):
109143
"""Updates the given record."""
110144
record = Record.get_record(pid.object_uuid)
111-
with db.session.begin_nested():
112-
if record.files and not skip_files:
113-
bucket_id = record.files.bucket
114-
bucket = Bucket.get(bucket_id.id)
115-
for o in ObjectVersion.get_by_bucket(bucket).all():
116-
o.remove()
117-
o.file.delete()
118-
RecordsBuckets.query.filter_by(
119-
record=record.model,
120-
bucket=bucket
121-
).delete()
122-
bucket_id.remove()
123-
db.session.commit()
145+
# with db.session.begin_nested():
146+
# if record.files and not skip_files:
147+
# bucket_id = record.files.bucket
148+
# bucket = Bucket.get(bucket_id.id)
149+
# for o in ObjectVersion.get_by_bucket(bucket).all():
150+
# o.remove()
151+
# o.file.delete()
152+
# RecordsBuckets.query.filter_by(
153+
# record=record.model,
154+
# bucket=bucket
155+
# ).delete()
156+
# bucket_id.remove()
157+
# db.session.commit()
158+
124159
record.update(data)
160+
sip_files_content = []
125161
if not skip_files:
126-
bucket = Bucket.create()
127-
handle_record_files(data, bucket, files, skip_files)
128-
RecordsBuckets.create(
129-
record=record.model, bucket=bucket)
130-
return record
162+
sip_files_content = handle_sip_files(
163+
files,
164+
skip_files,
165+
skip_sips
166+
)
167+
# bucket = Bucket.create()
168+
# sip_files_content = handle_record_files(
169+
# data, bucket, files, skip_files, skip_sips)
170+
# RecordsBuckets.create(
171+
# record=record.model, bucket=bucket)
172+
return record, sip_files_content
131173

132174

133175
def create_doc(data, schema):
@@ -156,6 +198,8 @@ def fixtures():
156198
@fixtures.command()
157199
@click.option('--skip-files', is_flag=True, default=False,
158200
help='Skip loading of files')
201+
@click.option('--skip-sips', is_flag=True, default=False,
202+
help='Skip create/update of SIPs')
159203
@click.option('files', '--file', '-f', multiple=True,
160204
type=click.Path(exists=True),
161205
help='Path to the file(s) to be loaded. If not provided, all'
@@ -165,8 +209,9 @@ def fixtures():
165209
@click.option('--mode', required=True, type=click.Choice(
166210
['insert', 'replace', 'insert-or-replace']))
167211
@with_appcontext
168-
def records(skip_files, files, profile, mode):
212+
def records(skip_files, skip_sips, files, profile, mode):
169213
"""Load all records."""
214+
170215
if profile:
171216
import cProfile
172217
import pstats
@@ -187,31 +232,34 @@ def records(skip_files, files, profile, mode):
187232
else:
188233
record_json = glob.glob(os.path.join(data, '*.json'))
189234

235+
190236
for filename in record_json:
191237
# name = filename.split('/')[-1]
192238
# if name.startswith('opera'):
193239
# click.echo('Skipping opera records ...')
194240
# continue
241+
195242
click.echo('Loading records from {0} ...'.format(filename))
196243
with open(filename, 'rb') as source:
197244
for data in json.load(source):
198-
199245
if not data:
200246
click.echo('IGNORING a possibly broken or corrupted '
201247
'record entry in file {0} ...'.format(filename))
202248
continue
203249

204250
files = data.get('files', [])
205251

252+
pid = None
206253
if mode == 'insert-or-replace':
207254
try:
208255
pid = PersistentIdentifier.get('recid', data['recid'])
209256
if pid:
210-
record = update_record(
211-
pid, schema, data, files, skip_files)
257+
record, sip_files_content = update_record(
258+
pid, schema, data, files, skip_files, skip_sips)
212259
action = 'updated'
213260
except PIDDoesNotExistError:
214-
record = create_record(schema, data, files, skip_files)
261+
pid, record, sip_files_content = create_record(
262+
schema, data, files, skip_files, skip_sips)
215263
action = 'inserted'
216264
elif mode == 'insert':
217265
try:
@@ -223,7 +271,8 @@ def records(skip_files, files, profile, mode):
223271
data.get('recid')), err=True)
224272
return
225273
except PIDDoesNotExistError:
226-
record = create_record(schema, data, files, skip_files)
274+
pid, record, sip_files_content = create_record(
275+
schema, data, files, skip_files, skip_sips)
227276
action = 'inserted'
228277
else:
229278
try:
@@ -234,13 +283,20 @@ def records(skip_files, files, profile, mode):
234283
'cannot replace it.'.format(
235284
data.get('recid')), err=True)
236285
return
237-
record = update_record(
238-
pid, schema, data, files, skip_files)
286+
record, sip_files_content = update_record(
287+
pid, schema, data, files, skip_files, skip_sips)
239288
action = 'updated'
240289

290+
291+
241292
if not skip_files:
242293
record.files.flush()
243294
record.commit()
295+
296+
if not skip_sips:
297+
sip_record(pid, record, sip_files_content, action)
298+
# sip_record(pid, record, ''.join(sip_files_content), action)
299+
244300
db.session.commit()
245301
click.echo(
246302
'Record recid {0} {1}.'.format(
@@ -462,3 +518,49 @@ def pids():
462518
db.session.add(record)
463519
db.session.commit()
464520
db.session.expunge_all()
521+
522+
523+
524+
@fixtures.command()
525+
@with_appcontext
526+
def sipmetadata():
527+
"""Load sipmetadata types."""
528+
data = [
529+
{
530+
"title": "CERN Open Data Record JSON",
531+
"name": "record-json",
532+
"format": "json",
533+
"schema": current_app.extensions['invenio-jsonschemas'] \
534+
.path_to_url('records/record-v1.0.0.json')
535+
},
536+
{
537+
"title": "CERN Open Data Docs JSON",
538+
"name": "docs-json",
539+
"format": "json",
540+
"schema": current_app.extensions['invenio-jsonschemas'] \
541+
.path_to_url('records/docs-v1.0.0.json')
542+
},
543+
{
544+
"title": "CERN Open Data Glossary JSON",
545+
"name": "glossary-json",
546+
"format": "json",
547+
"schema": current_app.extensions['invenio-jsonschemas'] \
548+
.path_to_url('records/glossary-term-v1.0.0.json')
549+
},
550+
{
551+
"title": "BagIt Archiver metadata",
552+
"name": "bagit",
553+
"format": "json",
554+
"schema": current_app.extensions['invenio-jsonschemas'] \
555+
.path_to_url('sipstore/bagit-v1.0.0.json')
556+
}
557+
]
558+
559+
click.secho('Loading SIP metadata types...', fg='blue')
560+
with click.progressbar(data) as types:
561+
with db.session.begin_nested():
562+
for type in types:
563+
db.session.add(SIPMetadataType(**type))
564+
db.session.commit()
565+
click.secho('SIP metadata types loaded!', fg='green')
566+
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
[
2+
{
3+
"abstract": {
4+
"description": "Sample event set from /BTag/Run2011A-12Oct2013-v1/AOD primary dataset in json format readable from the browser-based 3d event display"
5+
},
6+
"accelerator": "CERN-LHC",
7+
"authors": [
8+
{
9+
"name": "McCauley, Thomas"
10+
}
11+
],
12+
"collections": [
13+
"CMS-Derived-Datasets"
14+
],
15+
"collision_information": {
16+
"energy": "7TeV",
17+
"type": "pp"
18+
},
19+
"date_created": [
20+
"2011"
21+
],
22+
"date_published": "2016",
23+
"distribution": {
24+
"formats": [
25+
"ig"
26+
],
27+
"number_events": 25
28+
},
29+
"doi": "10.7483/OPENDATA.CMS.BK3T.NKC6",
30+
"experiment": "CMS",
31+
"files": [
32+
{
33+
"checksum": "sha1:a100d93d3b27def7954fa5827e9717ab4ec407cc",
34+
"size": 2411180,
35+
"uri": "root://eospublic.cern.ch//eos/opendata/cms/Run2011A/BTag/IG/12Oct2013-v1/BTag.ig"
36+
}
37+
],
38+
"methodology": {
39+
"description": "These files contain the objects to be displayed with the online event display. No event selection, apart accepting only the validated runs, is applied. The software to produce these files is available in:",
40+
"links": [
41+
{
42+
"recid": "550"
43+
}
44+
]
45+
},
46+
"note": {
47+
"description": "No selection or quality criteria have been applied on the individual physics objects, apart from accepting only the validated runs"
48+
},
49+
"publisher": "CERN Open Data Portal",
50+
"recid": "614",
51+
"relations": [
52+
{
53+
"doi": "10.7483/OPENDATA.CMS.N372.QF6S",
54+
"recid": "15",
55+
"title": "/BTag/Run2011A-12Oct2013-v1/AOD",
56+
"type": "isChildOf"
57+
}
58+
],
59+
"run_period": [
60+
"Run2011A"
61+
],
62+
"system_details": {
63+
"global_tag": "FT_53_LV5_AN1::All",
64+
"release": "CMSSW_5_3_30"
65+
},
66+
"title": "Event display file derived from /BTag/Run2011A-12Oct2013-v1/AOD",
67+
"type": {
68+
"primary": "Dataset",
69+
"secondary": [
70+
"Derived"
71+
]
72+
},
73+
"usage": {
74+
"description": "The data can be accessed from the file menu of the online event display",
75+
"links": [
76+
{
77+
"description": "Explore and visualise events",
78+
"url": "/visualise/events/CMS"
79+
}
80+
]
81+
}
82+
}
83+
]

0 commit comments

Comments
 (0)