Skip to content

Specimen records

ObjectSerializer

Bases: RDFSerializer

Simple subclass of the RDFSerializer class that adds a record serializer method.

Source code in ckanext/nhm/dcat/specimen_records.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
class ObjectSerializer(RDFSerializer):
    """
    An RDFSerializer extended with one extra method which serializes a single record.
    """

    # extra keyword arguments for the graph's serialize call, keyed on the rdflib format name
    serializer_kwargs = {
        'json-ld': {
            # compact the json output: namespaces get declared once, up front, in a @context
            # field instead of being repeated next to every value
            'auto_compact': True
        },
        'pretty-xml': {
            # stop resource tags being nested inside one another so that the metadata, data
            # and any image tags each appear on their own
            'max_depth': 1
        },
    }

    def serialize_record(
        self, record: Record, output_format: str = 'xml', version: Optional[int] = None
    ):
        """
        Build the RDF triples for the given record, add them to this serializer's graph and
        return the serialized graph.

        :param record: the record to serialize
        :param output_format: the requested output format, defaults to `xml`. It is converted
                              to an rdflib format name using `url_to_rdflib_format`
        :param version: the version of the record, or None if there is no version
        :returns: the result of serializing the graph in the requested format
        """
        rdflib_format = url_to_rdflib_format(output_format)
        namespaces = Namespaces(self.g)
        triples = RecordGraphBuilder(record, namespaces, output_format, version)

        for triple in triples:
            # duplicates may come out of the builder; add() simply hands them to the
            # triplestore, which copes with them fine. set() is not used because its replace
            # step removes every triple already stored for the same subject-predicate pair
            # (set(x, y, a) then set(x, y, b) leaves only (x, y, b)), which is too aggressive
            self.g.add(triple)

        format_kwargs = self.serializer_kwargs.get(rdflib_format, {})
        return self.g.serialize(format=rdflib_format, **format_kwargs)

serialize_record(record, output_format='xml', version=None)

Given a record dict, returns an RDF serialization.

The serialization format can be defined using the output_format parameter. It must be one of the ones supported by RDFLib, defaults to xml.

Returns a string with the serialized record

Source code in ckanext/nhm/dcat/specimen_records.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
def serialize_record(
    self, record: Record, output_format: str = 'xml', version: Optional[int] = None
):
    """
    Build the RDF triples for the given record, add them to this serializer's graph and
    return the serialized graph.

    :param record: the record to serialize
    :param output_format: the requested output format, defaults to `xml`. It is converted
                          to an rdflib format name using `url_to_rdflib_format`
    :param version: the version of the record, or None if there is no version
    :returns: the result of serializing the graph in the requested format
    """
    rdflib_format = url_to_rdflib_format(output_format)
    namespaces = Namespaces(self.g)
    triples = RecordGraphBuilder(record, namespaces, output_format, version)

    for triple in triples:
        # duplicates may come out of the builder; add() simply hands them to the
        # triplestore, which copes with them fine. set() is not used because its replace
        # step removes every triple already stored for the same subject-predicate pair
        # (set(x, y, a) then set(x, y, b) leaves only (x, y, b)), which is too aggressive
        self.g.add(triple)

    format_kwargs = self.serializer_kwargs.get(rdflib_format, {})
    return self.g.serialize(format=rdflib_format, **format_kwargs)

RecordGraphBuilder

Bases: object

Class that can generate the triples necessary to represent a record in rdf format.

Source code in ckanext/nhm/dcat/specimen_records.py
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
class RecordGraphBuilder(object):
    """
    Class that can generate the triples necessary to represent a record in rdf format.

    Iterating over an instance yields the triples. Each private generator method
    contributes one group of triples and __iter__ chains them together, dropping any
    triple whose object is None.
    """

    def __init__(
        self,
        record: Record,
        namespaces: Namespaces,
        output_format: str,
        version: Optional[int],
    ):
        """
        :param record: a Record object
        :param namespaces: the namespaces object to use
        :param output_format: the format the generated graph will be output in. This will be used to
                              create the metadata URI (i.e. object_url + .{output_format})
        :param version: the version of the record in question, or None if there is no version
        """
        self.record = record
        self.namespaces = namespaces
        self.output_format = output_format
        self.version = version

        # figure out the rounded version of the record
        self.rounded_version = toolkit.get_action('vds_version_round')(
            {},
            {
                'resource_id': self.record.resource_id,
                'version': self.version,
            },
        )

        # figure out the object URI for the record (possibly with version)
        if version is None:
            self.base_object_uri = object_uri(record.data['occurrenceID'])
        else:
            self.base_object_uri = object_uri(
                record.data['occurrenceID'], version=self.rounded_version
            )
        # this will be used as the subject for any record data triples
        self.record_ref = URIRef(self.base_object_uri)

        # grab the gbif version of the record if we can
        self.gbif_record = self._get_gbif_record()

    def _get_gbif_record(self):
        """
        Retrieve the GBIF representation of the record if we can.

        :returns: the GBIF record dict or None if we couldn't get it or didn't have a
            GBIF ID associated with the record
        """
        gbif_id = self.record.data.get('gbifID', None)

        if gbif_id is not None:
            try:
                context = {'ignore_auth': True}
                data_dict = {'gbif_id': gbif_id}
                return toolkit.get_action('gbif_record_show')(context, data_dict)
            except toolkit.ObjectNotFound:
                # a missing GBIF record is not an error, the record is simply described
                # without any GBIF data
                pass
        return None

    def __iter__(self):
        """
        Iterating over this object will yield the triples that represent the record.

        :returns: yields 3-tuples
        """
        triple_generators = [
            self._metadata(),
            self._cetaf_cspp(),
            self._gbif(),
            self._images(),
            self._dwc(),
            self._version_info(),
            self._extras(),
        ]

        for triple in itertools.chain(*triple_generators):
            # if the object part of the triple is None, don't yield it. This is tied to the
            # _get_value function on this class which essentially provides a way of avoiding having
            # to use lots of if checks (i.e. if field has value, yield)
            if triple[2] is not None:
                yield triple

    def _get_value(self, field, source=None):
        """
        Retrieve a value from the given source and return it wrapped in a Literal. The
        default source is the self.record.data dict. This function works in conjunction
        with the __iter__ function above which filters out triples containing a None as
        the object (the 3rd value).

        :param field: the field to get the value of
        :param source: the source dict to retrieve the field's value from
        :returns: the value wrapped in a Literal or None if the field doesn't exist on
            the source (or its value is None)
        """
        if source is None:
            source = self.record.data
        value = source.get(field, None)
        if value is not None:
            return Literal(value)
        else:
            return None

    def _metadata(self):
        """
        Yields triples which describe this RDF output, i.e. a meta-metadata description.

        :returns: yields triples
        """
        # the metadata subject is the object URI with the output format as an extension
        metadata_uri = '{}.{}'.format(self.base_object_uri, self.output_format)
        meta_ref = URIRef(metadata_uri)
        yield meta_ref, self.namespaces.dc.subject, self.record_ref
        yield (
            meta_ref,
            self.namespaces.dc.creator,
            Literal('Natural History Museum, London'),
        )
        yield meta_ref, self.namespaces.dc.created, Literal(datetime.now())

    def _cetaf_cspp(self):
        """
        Yields the triples required for the record data to ensure we conform to the
        CETAF CSPP recommendations, for more info see here:
        https://cetafidentifiers.biowikifarm.net/wiki/CSPP.

        :returns: yields triples
        """
        yield (
            self.record_ref,
            self.namespaces.dc.title,
            self._get_value('scientificName'),
        )
        yield self.record_ref, self.namespaces.dc.type, Literal('Specimen')
        yield (
            self.record_ref,
            self.namespaces.dwc.scientificName,
            self._get_value('scientificName'),
        )
        yield self.record_ref, self.namespaces.dwc.family, self._get_value('family')

        # find the previous determinations for this record and yield them as the
        # previousIdentifications term, ignoring the current determination the record is filed as
        determination_names = self.record.data.get('determinationNames', [])
        filed_as = self.record.data.get('determinationFiledAs', [])
        if determination_names and filed_as:
            previous_names = [
                name
                for name, filed in zip(determination_names, filed_as)
                if filed == 'No'
            ]
            # only yield the term if there actually are previous determinations, otherwise
            # we'd emit a previousIdentifications triple built from an empty list of names
            if previous_names:
                yield (
                    self.record_ref,
                    self.namespaces.dwc.previousIdentifications,
                    Literal(as_dwc_list(previous_names)),
                )

        yield (
            self.record_ref,
            self.namespaces.dwc.fieldNumber,
            self._get_value('fieldNumber'),
        )
        yield (
            self.record_ref,
            self.namespaces.dwc.recordedBy,
            self._get_value('recordedBy'),
        )

        # if there is associated media, yield it as a list
        images = self.record.images
        if images:
            value = as_dwc_list(image.url for image in images)
            yield self.record_ref, self.namespaces.dwc.associatedMedia, Literal(value)

        yield (
            self.record_ref,
            self.namespaces.dwc.decimalLatitude,
            self._get_value('decimalLatitude'),
        )
        yield (
            self.record_ref,
            self.namespaces.dwc.decimalLongitude,
            self._get_value('decimalLongitude'),
        )

        # if there's a GBIF record, see if we can yield a country code for the record from it
        if self.gbif_record is not None:
            yield (
                self.record_ref,
                self.namespaces.dwc.countryCode,
                self._get_value('countryCode', source=self.gbif_record),
            )

        if self.record.data.get('created', None) is not None:
            yield (
                self.record_ref,
                self.namespaces.dc.created,
                Literal(self.record.data['created']),
            )
        yield self.record_ref, self.namespaces.dc.publisher, URIRef('https://nhm.ac.uk')

    def _images(self):
        """
        Yields triples describing the images associated with this record, if there are
        any. Each image is connected to the record through a FOAF depicts field
        connection and then the image is described in its own set of triples where the
        image URI is used as the subject.

        :returns: yields triples
        """
        for image in self.record.images:
            image_uri = URIRef(image.url)
            yield image_uri, self.namespaces.rdf.type, self.namespaces.foaf.Image

            yield image_uri, self.namespaces.dc.title, Literal(image.title)
            yield image_uri, self.namespaces.cc.license, URIRef(image.license_url)
            yield image_uri, self.namespaces.dc.RightsStatement, Literal(image.rights)
            # although the actual image could be something else, the preview will always be a jpeg
            yield image_uri, self.namespaces.dc.format, Literal('image/jpeg')
            # add link from image to object
            yield image_uri, self.namespaces.foaf.depicts, self.record_ref
            # add a link from the object to the image
            yield self.record_ref, self.namespaces.foaf.depiction, image_uri
            # add a thumbnail link
            if image.is_mss_image:
                yield (
                    image_uri,
                    self.namespaces.foaf.thumbnail,
                    URIRef(image.thumbnail_url),
                )

    def _gbif(self):
        """
        Yields triples describing the record using the GBIF record data associated with
        it. Nothing is yielded if there is no GBIF record.

        :returns: yields triples
        """
        if self.gbif_record is not None:
            # assert equivalence with the GBIF record
            yield (
                self.record_ref,
                self.namespaces.owl.sameAs,
                URIRef(
                    'https://www.gbif.org/occurrence/{}'.format(
                        self.gbif_record['gbifID']
                    )
                ),
            )
            # if we have a GBIF country code, add it
            yield (
                self.record_ref,
                self.namespaces.dwc.countryCode,
                self._get_value('countryCode', source=self.gbif_record),
            )

    def _dwc(self):
        """
        Yields triples describing the record using DWC (DarWin Core) terms.

        For each term, if the GBIF record has a `{term}Key` value the term is linked to the
        GBIF species URI built from that key (and the URI is given the record's value as
        its label). In every other case the record's value for the term is yielded as a
        plain literal.

        :returns: yields triples
        """
        yield (
            self.record_ref,
            self.namespaces.dc.identifier,
            Literal(self.record.data['occurrenceID']),
        )

        dwc_terms_dict = dwc_terms(self.record.data.keys())

        # these are handled separately after the main loop
        groups_to_skip = {'dynamicProperties'}
        terms_to_skip = {'associatedMedia', 'created', 'modified'}
        for group, terms in dwc_terms_dict.items():
            if group in groups_to_skip:
                continue

            for term in terms.values():
                if term in terms_to_skip:
                    continue

                gbif_key = None
                if self.gbif_record is not None:
                    gbif_key = self.gbif_record.get(f'{term}Key')

                if gbif_key:
                    gbif_uri = URIRef(f'http://www.gbif.org/species/{gbif_key}')
                    # add the GBIF species URI with label
                    yield (
                        gbif_uri,
                        self.namespaces.rdfs.label,
                        Literal(self.record.data.get(term)),
                    )
                    # and associated our specimen object's DWC term with the GBIF URI
                    yield (
                        self.record_ref,
                        getattr(self.namespaces.dwc, term),
                        gbif_uri,
                    )
                else:
                    # no GBIF record, or no GBIF key for this particular term: fall back to
                    # the record's own value so that the term isn't lost from the output
                    yield (
                        self.record_ref,
                        getattr(self.namespaces.dwc, term),
                        Literal(self.record.data.get(term)),
                    )

        # retrieve the dynamic properties and yield them as one JSON dump
        dynamic_properties_dict = {}
        for properties in dwc_terms_dict.get('dynamicProperties', {}).values():
            for dynamic_property in properties:
                # created is yielded separately below
                if dynamic_property == 'created':
                    continue
                dynamic_properties_dict[dynamic_property] = self.record.data.get(
                    dynamic_property
                )
        if dynamic_properties_dict:
            yield (
                self.record_ref,
                self.namespaces.dwc.dynamicProperties,
                Literal(json.dumps(dynamic_properties_dict)),
            )

        # yield the associatedMedia term as a list of image URIs
        images = self.record.images
        if images:
            yield (
                self.record_ref,
                self.namespaces.dwc.associatedMedia,
                Literal(as_dwc_list(image.url for image in images)),
            )

        if self.record.data.get('created', None) is not None:
            # yield the created date
            yield (
                self.record_ref,
                self.namespaces.dc.created,
                Literal(self.record.data['created']),
            )

        if self.record.data.get('modified', None) is not None:
            # yield the modified date
            yield (
                self.record_ref,
                self.namespaces.dwc.modified,
                Literal(self.record.data['modified']),
            )

    def _version_info(self):
        """
        Yield simple version information about the record.

        :returns: yields triples
        """
        yield (
            self.record_ref,
            self.namespaces.owl.versionInfo,
            Literal(self.rounded_version),
        )

        if self.version is None or self.version > self.rounded_version:
            # if there is no version given or the version requested is beyond the latest version the
            # data we're using is the same as the latest version's data, yield a same as to show
            # this
            yield (
                self.record_ref,
                self.namespaces.owl.sameAs,
                URIRef(
                    object_uri(
                        self.record.data['occurrenceID'], version=self.rounded_version
                    )
                ),
            )

    def _extras(self):
        """
        Yields some additional triples that don't really fit under any of the other
        existing method groupings.

        :returns: yields triples
        """
        yield (
            self.record_ref,
            self.namespaces.aiiso.Department,
            Literal(get_department(self.record.data['collectionCode'])),
        )
        yield (
            self.record_ref,
            self.namespaces.aiiso.Division,
            self._get_value('subDepartment'),
        )

        yield (
            self.record_ref,
            self.namespaces.void.inDataset,
            URIRef(dataset_uri({'id': self.record.package_id}) + '#dataset'),
        )

__init__(record, namespaces, output_format, version)

Parameters:

Name Type Description Default
record Record

a Record object

required
namespaces Namespaces

the namespaces object to use

required
output_format str

the format the generated graph will be output in. This will be used to create the metadata URI (i.e. object_url + .{output_format})

required
version Optional[int]

the version of the record in question, or None if there is no version

required
Source code in ckanext/nhm/dcat/specimen_records.py
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
def __init__(
    self,
    record: Record,
    namespaces: Namespaces,
    output_format: str,
    version: Optional[int],
):
    """
    Store the arguments and work out the values every triple generator relies on: the
    rounded version, the record's object URI and the GBIF record.

    :param record: a Record object
    :param namespaces: the namespaces object to use
    :param output_format: the format the generated graph will be output in. This will be used to
                          create the metadata URI (i.e. object_url + .{output_format})
    :param version: the version of the record in question, or None if there is no version
    """
    self.record = record
    self.namespaces = namespaces
    self.output_format = output_format
    self.version = version

    # ask the vds_version_round action for the rounded version of the record
    round_version = toolkit.get_action('vds_version_round')
    round_request = {
        'resource_id': self.record.resource_id,
        'version': self.version,
    }
    self.rounded_version = round_version({}, round_request)

    # the object URI only includes a version when one was requested, and in that case the
    # rounded version is the one used
    occurrence_id = record.data['occurrenceID']
    if version is not None:
        self.base_object_uri = object_uri(occurrence_id, version=self.rounded_version)
    else:
        self.base_object_uri = object_uri(occurrence_id)

    # the subject of every record data triple
    self.record_ref = URIRef(self.base_object_uri)

    # the GBIF version of the record, if there is one
    self.gbif_record = self._get_gbif_record()

__iter__()

Iterating over this object will yield the triples that represent the record.

Returns:

Type Description

yields 3-tuples

Source code in ckanext/nhm/dcat/specimen_records.py
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
def __iter__(self):
    """
    Yield every triple that represents the record, one group of triples after another.

    :returns: yields 3-tuples
    """
    sources = (
        self._metadata(),
        self._cetaf_cspp(),
        self._gbif(),
        self._images(),
        self._dwc(),
        self._version_info(),
        self._extras(),
    )
    all_triples = itertools.chain.from_iterable(sources)

    # triples with None as their object are dropped. This pairs up with _get_value, which
    # returns None for missing fields so that the generator methods don't each need their own
    # "if field has value, yield" checks
    yield from (triple for triple in all_triples if triple[2] is not None)