Skip to content

Taxonomy

AuthorParserStage

Bases: BaseParserStage

A specific stage for searching for authors in the tag body.

Source code in ckanext/nhm/lib/taxonomy.py
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
class AuthorParserStage(BaseParserStage):
    """
    A specific stage for searching for authors in the tag body, using the record's
    `scientificNameAuthorship` value as the thing to look for.
    """

    def _meets_criteria(self, body, record_dict):
        """
        Ensures a usable (non-blank string) author field is present in the record.

        :param body: the tag body to search in (not used by this check)
        :param record_dict: the full record
        :returns: True if the record has a non-blank author string, otherwise False
        """
        author = record_dict.get('scientificNameAuthorship')
        # a None/blank author would otherwise be turned into a pattern that matches
        # almost any whitespace in the body
        return isinstance(author, str) and bool(author.strip())

    def _extract(self, body, record_dict):
        """
        Searches for the full author string, then breaks it up into smaller pieces
        (sections in brackets, individual names) and tries each of those in turn if
        the full string is not found.

        :param body: the tag body to search in
        :param record_dict: the full record
        :returns: the index of the whitespace character that precedes the first
            author string found in the body, or None if none of them are found
        """
        full_author = record_dict['scientificNameAuthorship']
        fragments = re.findall(r'\(([\w\s]+)\)', full_author) + re.findall(
            r'([\w.\s]+)', full_author
        )
        candidates = [full_author] + [fragment.strip() for fragment in fragments]
        # dict.fromkeys removes duplicates while keeping a stable order: the full
        # string first, then the fragments in the order they were found
        for candidate in dict.fromkeys(candidates):
            if not candidate:
                # whitespace-only fragments strip down to nothing; skip them
                continue
            match = re.search(
                r'\s\(?{0}\)?(\s|$)'.format(re.escape(candidate)), body
            )
            if match:
                return match.start()
        return None

BaseParserStage

Bases: object

Represents a single stage of a field parsing process.

Source code in ckanext/nhm/lib/taxonomy.py
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
class BaseParserStage(object):
    """
    One step in a field parsing process: a criteria check paired with an index
    extraction.

    The two hooks are marked with `abc.abstractmethod`, but this class derives from
    plain `object`, so the base implementations remain callable.
    """

    @abc.abstractmethod
    def _meets_criteria(self, body, record_dict):
        """
        Decides whether this stage applies to the given item.

        :param body: the tag body to search in
        :param record_dict: the full record
        :returns: boolean for pass/fail; the base implementation always passes
        """
        return True

    @abc.abstractmethod
    def _extract(self, body, record_dict):
        """
        Locates an index within the tag body.

        :param body: the tag body
        :param record_dict: the full record
        :returns: a character index; the base implementation returns 0
        """
        return 0

    def evaluate(self, body, record_dict):
        """
        Runs the criteria check and, only if it passes, the extraction.

        :param body: the tag body
        :param record_dict: the full record
        :returns: character index if criteria met, None if not
        """
        if not self._meets_criteria(body, record_dict):
            return None
        return self._extract(body, record_dict)

evaluate(body, record_dict)

Checks if the item meets the criteria then returns the index.

Parameters:

Name Type Description Default
body

the tag body

required
record_dict

the full record

required

Returns:

Type Description

character index if criteria met, None if not

Source code in ckanext/nhm/lib/taxonomy.py
161
162
163
164
165
166
167
168
169
170
def evaluate(self, body, record_dict):
    """
    Runs the criteria check and, only if it passes, the extraction.

    :param body: the tag body
    :param record_dict: the full record
    :returns: character index if criteria met, None if not
    """
    if not self._meets_criteria(body, record_dict):
        return None
    return self._extract(body, record_dict)

CapitalisedParserStage

Bases: BaseParserStage

The last resort stage in the search for authors - searches for the second capitalised word in the tag body.

Source code in ckanext/nhm/lib/taxonomy.py
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
class CapitalisedParserStage(BaseParserStage):
    """
    The last resort stage in the search for authors - searches for the second capitalised word in
    the tag body.
    """

    # a capital letter at the start of a whitespace-delimited word, optionally
    # preceded by an opening bracket so that "(Linnaeus)" is split before the "("
    _CAPITALISED_WORD = re.compile(r'(?<!\S)\(?[A-Z]')

    def _capitalised_starts(self, body):
        """
        Finds the start index of every capitalised word in the tag body.

        :param body: the tag body
        :returns: a list of character indexes, in order of appearance
        """
        return [match.start() for match in self._CAPITALISED_WORD.finditer(body)]

    def _meets_criteria(self, body, record_dict):
        """
        Checks for multiple capitalised words in the tag body.

        :param body: the tag body
        :param record_dict: the full record (not used by this check)
        :returns: True if there are at least two capitalised words, otherwise False
        """
        return len(self._capitalised_starts(body)) > 1

    def _extract(self, body, record_dict):
        """
        Finds the start index of the second capitalised word.

        Uses the same word detection as `_meets_criteria`, so capital letters in the
        middle of a word (e.g. the "D" in "McDonald") are not counted as word starts.

        :param body: the tag body
        :param record_dict: the full record (not used by this stage)
        :returns: the start index of the second capitalised word
        """
        return self._capitalised_starts(body)[1]

SimpleFieldParserStage

Bases: BaseParserStage

A generic stage for searching for a certain field within the tag body (for the purposes of finding authors).

Source code in ckanext/nhm/lib/taxonomy.py
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
class SimpleFieldParserStage(BaseParserStage):
    """
    A generic stage for searching for a certain field within the tag body (for the
    purposes of finding authors).
    """

    def __init__(self, field_name):
        """
        :param field_name: the name of the record field whose value is searched for
        """
        self.field_name = field_name

    def _meets_criteria(self, body, record_dict):
        """
        Checks that the record contains a non-empty string value for this field and
        that the value is present in the tag body.

        :param body: the tag body to search in
        :param record_dict: the full record
        :returns: True if the field value can be located in the body, otherwise False
        """
        field_value = record_dict.get(self.field_name)
        # an empty string is "in" every body and None cannot be searched for at all,
        # so neither counts as a usable value
        return (
            isinstance(field_value, str)
            and field_value != ''
            and field_value in body
        )

    def _extract(self, body, record_dict):
        """
        If the value is at the end of the string, there is no point in continuing;
        otherwise, it looks for the first capitalised word after that value.

        :param body: the tag body to search in
        :param record_dict: the full record
        :returns: the start index of the estimated author string if found, else None.
        """
        field_value = record_dict[self.field_name]
        if re.search(f'{re.escape(field_value)}$', body):
            return len(body)
        value_start = body.find(field_value)
        if value_start == -1:
            # the value is not in the body, so there is nothing to search after
            return None
        value_end = value_start + len(field_value)
        # only look at the text following the first occurrence of the value
        match = re.search(r'\(?[A-Z]\w*', body[value_end:])
        return value_end + match.start() if match else None

extract_class(record)

Extract the class value from the given record.

Parameters:

Name Type Description Default
record

the record dict

required

Returns:

Type Description

the class value, or None if it is not present

Source code in ckanext/nhm/lib/taxonomy.py
54
55
56
57
58
59
60
61
def extract_class(record):
    """
    Look up the `class` rank in a record.

    :param record: the record dict
    :returns: the value stored under `class`, or None when the key is absent
    """
    rank_value = record.get('class')
    return rank_value

extract_family(record)

Extract the family value from the given record.

Parameters:

Name Type Description Default
record

the record dict

required

Returns:

Type Description

the family value, or None if it is not present

Source code in ckanext/nhm/lib/taxonomy.py
64
65
66
67
68
69
70
71
def extract_family(record):
    """
    Look up the `family` rank in a record.

    :param record: the record dict
    :returns: the value stored under `family`, or None when the key is absent
    """
    rank_value = record.get('family')
    return rank_value

extract_genus(record)

Extract the genus value from the given record.

Parameters:

Name Type Description Default
record

the record dict

required

Returns:

Type Description

the genus value, or None if it is not present

Source code in ckanext/nhm/lib/taxonomy.py
74
75
76
77
78
79
80
81
def extract_genus(record):
    """
    Look up the `genus` rank in a record.

    :param record: the record dict
    :returns: the value stored under `genus`, or None when the key is absent
    """
    rank_value = record.get('genus')
    return rank_value

extract_kingdom(record)

Extract the kingdom value from the given record.

Parameters:

Name Type Description Default
record

the record dict

required

Returns:

Type Description

the kingdom value, or None if it is not present

Source code in ckanext/nhm/lib/taxonomy.py
34
35
36
37
38
39
40
41
def extract_kingdom(record):
    """
    Look up the `kingdom` rank in a record.

    :param record: the record dict
    :returns: the value stored under `kingdom`, or None when the key is absent
    """
    rank_value = record.get('kingdom')
    return rank_value

extract_phylum(record)

Extract the phylum value from the given record.

Parameters:

Name Type Description Default
record

the record dict

required

Returns:

Type Description

the phylum value, or None if it is not present

Source code in ckanext/nhm/lib/taxonomy.py
44
45
46
47
48
49
50
51
def extract_phylum(record):
    """
    Look up the `phylum` rank in a record.

    :param record: the record dict
    :returns: the value stored under `phylum`, or None when the key is absent
    """
    rank_value = record.get('phylum')
    return rank_value

extract_ranks(record)

Extracts the values for each rank (if present) in the given record. Ranks missing from the record are omitted.

Parameters:

Name Type Description Default
record

the record dict

required

Returns:

Type Description

the ranks as an OrderedDict in rank order.

Source code in ckanext/nhm/lib/taxonomy.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def extract_ranks(record):
    """
    Collects the value of each rank from the given record. Ranks whose extracted value
    is missing or empty are left out of the result.

    :param record: the record dict
    :returns: the ranks as an OrderedDict in rank order.
    """
    # listed from the broadest rank to the narrowest; this is the output order
    rank_extractors = (
        ('kingdom', extract_kingdom),
        ('phylum', extract_phylum),
        ('class', extract_class),
        ('family', extract_family),
        ('genus', extract_genus),
        ('species', extract_species),
    )
    found = OrderedDict()
    for rank_name, extract in rank_extractors:
        rank_value = extract(record)
        # skip ranks with no usable value
        if rank_value:
            found[rank_name] = rank_value
    return found

extract_species(record)

Extract the species value from the given record. This is achieved by extracting the species from the scientificName field of the record (if there is one) and the removal of any extra details (such as the author) from this value.

Parameters:

Name Type Description Default
record

the record dict

required

Returns:

Type Description

the species value, or None if it is not present

Source code in ckanext/nhm/lib/taxonomy.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def extract_species(record):
    """
    Derive the species value from the record's `scientificName` field (if there is
    one) by cutting off any extra details, such as the author, that follow it.

    :param record: the record dict
    :returns: the species value, or None if it is not present
    """
    scientific_name = record.get('scientificName', None)
    if not scientific_name:
        return None

    # the scientific name starts with the species but often has an author or date
    # after it; a split index of None keeps the whole name
    author_start = find_author_split(scientific_name, record)
    species_part = scientific_name[:author_start]
    return species_part.strip()

find_author_split(value, record_dict)

Given a string and a record, attempts to determine where in the string an author is defined. If an author is found, this returns the index at which the author part starts; otherwise it returns None.

Parameters:

Name Type Description Default
value

the value to search in

required
record_dict

the record dictionary to use as a supplementary source of information

required

Returns:

Type Description

the index at the start of the author part or None

Source code in ckanext/nhm/lib/taxonomy.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
def find_author_split(value, record_dict):
    """
    Tries to work out where in a string the author part begins, using the record as
    a supplementary source of information. Each parser stage is tried in order and
    the first truthy index wins.

    :param value: the value to search in
    :param record_dict: the record dictionary to use as a supplementary source of
        information
    :returns: the index at the start of the author part or None
    """
    # a value without any whitespace cannot hold both a name and an author
    if re.search(r'\s', value) is None:
        return None

    stages = (
        AuthorParserStage(),
        SimpleFieldParserStage('specificEpithet'),
        SimpleFieldParserStage('subgenus'),
        CapitalisedParserStage(),
    )

    split_index = None
    for stage in stages:
        split_index = stage.evaluate(value, record_dict)
        if split_index:
            return split_index

    # no stage produced a truthy index; hand back whatever the last stage returned
    return split_index