From 14e6e5cf6c95a87ba5f11f62fe5430e21d45e5f3 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Mon, 19 Feb 2024 13:40:57 +0200 Subject: [PATCH 01/14] add NormalizeFieldKeys middleware --- bibtexparser/middlewares/__init__.py | 1 + bibtexparser/middlewares/fieldkeys.py | 37 ++++++++ tests/middleware_tests/test_fieldkeys.py | 113 +++++++++++++++++++++++ 3 files changed, 151 insertions(+) create mode 100644 bibtexparser/middlewares/fieldkeys.py create mode 100644 tests/middleware_tests/test_fieldkeys.py diff --git a/bibtexparser/middlewares/__init__.py b/bibtexparser/middlewares/__init__.py index 1f5a938..f1cdb05 100644 --- a/bibtexparser/middlewares/__init__.py +++ b/bibtexparser/middlewares/__init__.py @@ -2,6 +2,7 @@ AddEnclosingMiddleware, RemoveEnclosingMiddleware, ) +from bibtexparser.middlewares.fieldkeys import NormalizeFieldKeys from bibtexparser.middlewares.interpolate import ResolveStringReferencesMiddleware from bibtexparser.middlewares.latex_encoding import ( LatexDecodingMiddleware, diff --git a/bibtexparser/middlewares/fieldkeys.py b/bibtexparser/middlewares/fieldkeys.py new file mode 100644 index 0000000..c5b5f2a --- /dev/null +++ b/bibtexparser/middlewares/fieldkeys.py @@ -0,0 +1,37 @@ +import logging +from typing import Collection, Dict, List, Set, Union + +from bibtexparser.library import Library +from bibtexparser.model import Block, Entry, Field + +from .middleware import BlockMiddleware + +class NormalizeFieldKeys(BlockMiddleware): + """Normalize field keys to lowercase. + + In case of conflicts (e.g. both 'author' and 'Author' exist in the same entry), + a warning is emitted, and the last value wins. + + Some other middlewares, such as `SeparateCoAuthors`, assume lowercase key names. + """ + + def __init__(self, allow_inplace_modification: bool = True): + super().__init__(allow_inplace_modification=allow_inplace_modification, + allow_parallel_execution=True) + + def transform_entry(self, entry: Entry, library: "Library") -> Union[Block, Collection[Block], None]: + seen_normalized_keys: Set[str] = set() + new_fields_dict: Dict[str, Field] = {} + for field in entry.fields: + normalized_key: str = field.key.lower() + if normalized_key in seen_normalized_keys: + # TODO: Log the full entry, too, to help the user find where the failure occurred? + logging.warning(f"NormalizeFieldKeys: duplicate normalized key '{normalized_key}' (original '{field.key}'); overriding previous value") + seen_normalized_keys.add(normalized_key) + field.key = normalized_key + new_fields_dict[normalized_key] = field # This implements "last one wins", but otherwise preserves insertion order. + + new_fields: List[Field] = list(new_fields_dict.values()) + entry.fields = new_fields + + return entry diff --git a/tests/middleware_tests/test_fieldkeys.py b/tests/middleware_tests/test_fieldkeys.py new file mode 100644 index 0000000..70bec8e --- /dev/null +++ b/tests/middleware_tests/test_fieldkeys.py @@ -0,0 +1,113 @@ +from bibtexparser.middlewares.enclosing import RemoveEnclosingMiddleware +from bibtexparser.middlewares.fieldkeys import NormalizeFieldKeys +from bibtexparser.splitter import Splitter + +test_bibtex_string_lowercasekeys = """ +@article{smith2022, + author = "Smith, J.", + title = "A Test Article", + journal = "J. of Testing", + month = "jan", + year = "2022" +} + +@book{doe2021, + author = "Doe, J.", + title = "A Test Book", + publisher = "Test Pub.", + year = "2021", + month = apr +} + +@inproceedings{jones2023, + author = "Jones, R.", + title = "A Test Conf. Paper", + booktitle = "Proc. of the Intl. Test Conf.", + year = "2023", + month = 8 +} +""" + +test_bibtex_string_capitalizedkeys = """ +@article{smith2022, + Author = "Smith, J.", + Title = "A Test Article", + Journal = "J. of Testing", + Month = "jan", + Year = "2022" +} + +@book{doe2021, + Author = "Doe, J.", + Title = "A Test Book", + Publisher = "Test Pub.", + Year = "2021", + Month = apr +} + +@inproceedings{jones2023, + Author = "Jones, R.", + Title = "A Test Conf. Paper", + Booktitle = "Proc. of the Intl. Test Conf.", + Year = "2023", + Month = 8 +} +""" + +def test_normalize_lowercase(): + original_library = Splitter(test_bibtex_string_lowercasekeys).split() + + new_library = NormalizeFieldKeys(allow_inplace_modification=False).transform( + original_library + ) + + assert "author" in new_library.entries_dict["smith2022"] + assert new_library.entries_dict["smith2022"]["author"] == '"Smith, J."' + assert "author" in new_library.entries_dict["doe2021"] + assert new_library.entries_dict["doe2021"]["author"] == '"Doe, J."' + assert "author" in new_library.entries_dict["jones2023"] + assert new_library.entries_dict["jones2023"]["author"] == '"Jones, R."' + + # Test the same after enclosing is removed + no_enclosing_library = RemoveEnclosingMiddleware( + allow_inplace_modification=False + ).transform(original_library) + new_library = NormalizeFieldKeys(allow_inplace_modification=False).transform( + no_enclosing_library + ) + + assert "author" in new_library.entries_dict["smith2022"] + assert new_library.entries_dict["smith2022"]["author"] == "Smith, J." + assert "author" in new_library.entries_dict["doe2021"] + assert new_library.entries_dict["doe2021"]["author"] == "Doe, J." + assert "author" in new_library.entries_dict["jones2023"] + assert new_library.entries_dict["jones2023"]["author"] == "Jones, R." + +def test_normalize_capitalized(): + original_library = Splitter(test_bibtex_string_capitalizedkeys).split() + + new_library = NormalizeFieldKeys(allow_inplace_modification=False).transform( + original_library + ) + + assert "author" in new_library.entries_dict["smith2022"] + assert new_library.entries_dict["smith2022"]["author"] == '"Smith, J."' + assert "author" in new_library.entries_dict["doe2021"] + assert new_library.entries_dict["doe2021"]["author"] == '"Doe, J."' + assert "author" in new_library.entries_dict["jones2023"] + assert new_library.entries_dict["jones2023"]["author"] == '"Jones, R."' + + # Test the same after enclosing is removed + no_enclosing_library = RemoveEnclosingMiddleware( + allow_inplace_modification=False + ).transform(original_library) + new_library = NormalizeFieldKeys(allow_inplace_modification=False).transform( + no_enclosing_library + ) + + assert "author" in new_library.entries_dict["smith2022"] + assert new_library.entries_dict["smith2022"]["author"] == "Smith, J." + assert "author" in new_library.entries_dict["doe2021"] + assert new_library.entries_dict["doe2021"]["author"] == "Doe, J." + assert "author" in new_library.entries_dict["jones2023"] + assert new_library.entries_dict["jones2023"]["author"] == "Jones, R." From e3e73b0619189d7ea5d520895115d48295a521e0 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Mon, 19 Feb 2024 15:10:25 +0200 Subject: [PATCH 02/14] log the key of the offending entry --- bibtexparser/middlewares/fieldkeys.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bibtexparser/middlewares/fieldkeys.py b/bibtexparser/middlewares/fieldkeys.py index c5b5f2a..cf830e9 100644 --- a/bibtexparser/middlewares/fieldkeys.py +++ b/bibtexparser/middlewares/fieldkeys.py @@ -25,8 +25,7 @@ def transform_entry(self, entry: Entry, library: "Library") -> Union[Block, Coll for field in entry.fields: normalized_key: str = field.key.lower() if normalized_key in seen_normalized_keys: - # TODO: Log the full entry, too, to help the user find where the failure occurred? - logging.warning(f"NormalizeFieldKeys: duplicate normalized key '{normalized_key}' (original '{field.key}'); overriding previous value") + logging.warning(f"NormalizeFieldKeys: in entry '{entry.key}': duplicate normalized key '{normalized_key}' (original '{field.key}'); overriding previous value") seen_normalized_keys.add(normalized_key) field.key = normalized_key new_fields_dict[normalized_key] = field # This implements "last one wins", but otherwise preserves insertion order. From b09afe4f363605f5530f1faf01176e5e73142b40 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Fri, 23 Feb 2024 13:39:22 +0200 Subject: [PATCH 03/14] reimplement test_fieldkeys using Entry --- tests/middleware_tests/test_fieldkeys.py | 112 +++++++++++------------ 1 file changed, 56 insertions(+), 56 deletions(-) diff --git a/tests/middleware_tests/test_fieldkeys.py b/tests/middleware_tests/test_fieldkeys.py index 70bec8e..b29fde5 100644 --- a/tests/middleware_tests/test_fieldkeys.py +++ b/tests/middleware_tests/test_fieldkeys.py @@ -1,62 +1,63 @@ +from bibtexparser import Library from bibtexparser.middlewares.enclosing import RemoveEnclosingMiddleware from bibtexparser.middlewares.fieldkeys import NormalizeFieldKeys -from bibtexparser.splitter import Splitter +from bibtexparser.model import Entry, Field + +test_entry_1a = Entry(entry_type="article", + key="smith2022", + fields=[Field(key="author", value='"Smith, J."'), + Field(key="title", value='"A Test Article"'), + Field(key="journal", value='"J. of Testing"'), + Field(key="month", value='"jan"'), + Field(key="year", value='"2022"')]) +test_entry_2a = Entry(entry_type="book", + key="doe2021", + fields=[Field(key="author", value='"Doe, J."'), + Field(key="title", value='"A Test Book"'), + Field(key="publisher", value='"Test Pub."'), + Field(key="year", value='"2021"'), + Field(key="month", value='apr')]) +test_entry_3a = Entry(entry_type="inproceedings", + key="jones2023", + fields=[Field(key="author", value='"Jones, R."'), + Field(key="title", value='"A Test Conf. Paper"'), + Field(key="booktitle", value='"Proc. of the Intl. Test Conf."'), + Field(key="year", value='"2023"'), + Field(key="month", value='8')]) +test_library_lowercasekeys = Library() +test_library_lowercasekeys.add(test_entry_1a) +test_library_lowercasekeys.add(test_entry_2a) +test_library_lowercasekeys.add(test_entry_3a) + +test_entry_1b = Entry(entry_type="article", + key="smith2022", + fields=[Field(key="author", value='"Smith, J."'), + Field(key="title", value='"A Test Article"'), + Field(key="journal", value='"J. of Testing"'), + Field(key="month", value='"jan"'), + Field(key="year", value='"2022"')]) +test_entry_2b = Entry(entry_type="book", + key="doe2021", + fields=[Field(key="author", value='"Doe, J."'), + Field(key="title", value='"A Test Book"'), + Field(key="publisher", value='"Test Pub."'), + Field(key="year", value='"2021"'), + Field(key="month", value='apr')]) +test_entry_3b = Entry(entry_type="inproceedings", + key="jones2023", + fields=[Field(key="author", value='"Jones, R."'), + Field(key="title", value='"A Test Conf. Paper"'), + Field(key="booktitle", value='"Proc. of the Intl. Test Conf."'), + Field(key="year", value='"2023"'), + Field(key="month", value='8')]) +test_library_capitalizedkeys = Library() +test_library_capitalizedkeys.add(test_entry_1b) +test_library_capitalizedkeys.add(test_entry_2b) +test_library_capitalizedkeys.add(test_entry_3b) -test_bibtex_string_lowercasekeys = """ -@article{smith2022, - author = "Smith, J.", - title = "A Test Article", - journal = "J. of Testing", - month = "jan", - year = "2022" -} - -@book{doe2021, - author = "Doe, J.", - title = "A Test Book", - publisher = "Test Pub.", - year = "2021", - month = apr -} - -@inproceedings{jones2023, - author = "Jones, R.", - title = "A Test Conf. Paper", - booktitle = "Proc. of the Intl. Test Conf.", - year = "2023", - month = 8 -} -""" - -test_bibtex_string_capitalizedkeys = """ -@article{smith2022, - Author = "Smith, J.", - Title = "A Test Article", - Journal = "J. of Testing", - Month = "jan", - Year = "2022" -} - -@book{doe2021, - Author = "Doe, J.", - Title = "A Test Book", - Publisher = "Test Pub.", - Year = "2021", - Month = apr -} - -@inproceedings{jones2023, - Author = "Jones, R.", - Title = "A Test Conf. Paper", - Booktitle = "Proc. of the Intl. Test Conf.", - Year = "2023", - Month = 8 -} -""" def test_normalize_lowercase(): - original_library = Splitter(test_bibtex_string_lowercasekeys).split() - + original_library = test_library_lowercasekeys new_library = NormalizeFieldKeys(allow_inplace_modification=False).transform( original_library ) @@ -84,8 +85,7 @@ def test_normalize_lowercase(): assert new_library.entries_dict["jones2023"]["author"] == "Jones, R." def test_normalize_capitalized(): - original_library = Splitter(test_bibtex_string_capitalizedkeys).split() - + original_library = test_library_capitalizedkeys new_library = NormalizeFieldKeys(allow_inplace_modification=False).transform( original_library ) From 106ec2e72034f0f0ea61d35a61a1af20e2dcaa74 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Fri, 23 Feb 2024 13:41:16 +0200 Subject: [PATCH 04/14] fieldkeys, test_fieldkeys: reformat with `black` --- bibtexparser/middlewares/fieldkeys.py | 21 +++-- tests/middleware_tests/test_fieldkeys.py | 109 ++++++++++++++--------- 2 files changed, 82 insertions(+), 48 deletions(-) diff --git a/bibtexparser/middlewares/fieldkeys.py b/bibtexparser/middlewares/fieldkeys.py index cf830e9..e36c1ca 100644 --- a/bibtexparser/middlewares/fieldkeys.py +++ b/bibtexparser/middlewares/fieldkeys.py @@ -6,6 +6,7 @@ from .middleware import BlockMiddleware + class NormalizeFieldKeys(BlockMiddleware): """Normalize field keys to lowercase. @@ -16,19 +17,27 @@ class NormalizeFieldKeys(BlockMiddleware): """ def __init__(self, allow_inplace_modification: bool = True): - super().__init__(allow_inplace_modification=allow_inplace_modification, - allow_parallel_execution=True) - - def transform_entry(self, entry: Entry, library: "Library") -> Union[Block, Collection[Block], None]: + super().__init__( + allow_inplace_modification=allow_inplace_modification, + allow_parallel_execution=True, + ) + + def transform_entry( + self, entry: Entry, library: "Library" + ) -> Union[Block, Collection[Block], None]: seen_normalized_keys: Set[str] = set() new_fields_dict: Dict[str, Field] = {} for field in entry.fields: normalized_key: str = field.key.lower() if normalized_key in seen_normalized_keys: - logging.warning(f"NormalizeFieldKeys: in entry '{entry.key}': duplicate normalized key '{normalized_key}' (original '{field.key}'); overriding previous value") + logging.warning( + f"NormalizeFieldKeys: in entry '{entry.key}': duplicate normalized key '{normalized_key}' (original '{field.key}'); overriding previous value" + ) seen_normalized_keys.add(normalized_key) field.key = normalized_key - new_fields_dict[normalized_key] = field # This implements "last one wins", but otherwise preserves insertion order. + new_fields_dict[normalized_key] = ( + field # This implements "last one wins", but otherwise preserves insertion order. + ) new_fields: List[Field] = list(new_fields_dict.values()) entry.fields = new_fields diff --git a/tests/middleware_tests/test_fieldkeys.py b/tests/middleware_tests/test_fieldkeys.py index b29fde5..c253ec1 100644 --- a/tests/middleware_tests/test_fieldkeys.py +++ b/tests/middleware_tests/test_fieldkeys.py @@ -3,53 +3,77 @@ from bibtexparser.middlewares.fieldkeys import NormalizeFieldKeys from bibtexparser.model import Entry, Field -test_entry_1a = Entry(entry_type="article", - key="smith2022", - fields=[Field(key="author", value='"Smith, J."'), - Field(key="title", value='"A Test Article"'), - Field(key="journal", value='"J. of Testing"'), - Field(key="month", value='"jan"'), - Field(key="year", value='"2022"')]) -test_entry_2a = Entry(entry_type="book", - key="doe2021", - fields=[Field(key="author", value='"Doe, J."'), - Field(key="title", value='"A Test Book"'), - Field(key="publisher", value='"Test Pub."'), - Field(key="year", value='"2021"'), - Field(key="month", value='apr')]) -test_entry_3a = Entry(entry_type="inproceedings", - key="jones2023", - fields=[Field(key="author", value='"Jones, R."'), - Field(key="title", value='"A Test Conf. Paper"'), - Field(key="booktitle", value='"Proc. of the Intl. Test Conf."'), - Field(key="year", value='"2023"'), - Field(key="month", value='8')]) +test_entry_1a = Entry( + entry_type="article", + key="smith2022", + fields=[ + Field(key="author", value='"Smith, J."'), + Field(key="title", value='"A Test Article"'), + Field(key="journal", value='"J. of Testing"'), + Field(key="month", value='"jan"'), + Field(key="year", value='"2022"'), + ], +) +test_entry_2a = Entry( + entry_type="book", + key="doe2021", + fields=[ + Field(key="author", value='"Doe, J."'), + Field(key="title", value='"A Test Book"'), + Field(key="publisher", value='"Test Pub."'), + Field(key="year", value='"2021"'), + Field(key="month", value="apr"), + ], +) +test_entry_3a = Entry( + entry_type="inproceedings", + key="jones2023", + fields=[ + Field(key="author", value='"Jones, R."'), + Field(key="title", value='"A Test Conf. Paper"'), + Field(key="booktitle", value='"Proc. of the Intl. Test Conf."'), + Field(key="year", value='"2023"'), + Field(key="month", value="8"), + ], +) test_library_lowercasekeys = Library() test_library_lowercasekeys.add(test_entry_1a) test_library_lowercasekeys.add(test_entry_2a) test_library_lowercasekeys.add(test_entry_3a) -test_entry_1b = Entry(entry_type="article", - key="smith2022", - fields=[Field(key="author", value='"Smith, J."'), - Field(key="title", value='"A Test Article"'), - Field(key="journal", value='"J. of Testing"'), - Field(key="month", value='"jan"'), - Field(key="year", value='"2022"')]) -test_entry_2b = Entry(entry_type="book", - key="doe2021", - fields=[Field(key="author", value='"Doe, J."'), - Field(key="title", value='"A Test Book"'), - Field(key="publisher", value='"Test Pub."'), - Field(key="year", value='"2021"'), - Field(key="month", value='apr')]) -test_entry_3b = Entry(entry_type="inproceedings", - key="jones2023", - fields=[Field(key="author", value='"Jones, R."'), - Field(key="title", value='"A Test Conf. Paper"'), - Field(key="booktitle", value='"Proc. of the Intl. Test Conf."'), - Field(key="year", value='"2023"'), - Field(key="month", value='8')]) +test_entry_1b = Entry( + entry_type="article", + key="smith2022", + fields=[ + Field(key="author", value='"Smith, J."'), + Field(key="title", value='"A Test Article"'), + Field(key="journal", value='"J. of Testing"'), + Field(key="month", value='"jan"'), + Field(key="year", value='"2022"'), + ], +) +test_entry_2b = Entry( + entry_type="book", + key="doe2021", + fields=[ + Field(key="author", value='"Doe, J."'), + Field(key="title", value='"A Test Book"'), + Field(key="publisher", value='"Test Pub."'), + Field(key="year", value='"2021"'), + Field(key="month", value="apr"), + ], +) +test_entry_3b = Entry( + entry_type="inproceedings", + key="jones2023", + fields=[ + Field(key="author", value='"Jones, R."'), + Field(key="title", value='"A Test Conf. Paper"'), + Field(key="booktitle", value='"Proc. of the Intl. Test Conf."'), + Field(key="year", value='"2023"'), + Field(key="month", value="8"), + ], +) test_library_capitalizedkeys = Library() test_library_capitalizedkeys.add(test_entry_1b) test_library_capitalizedkeys.add(test_entry_2b) @@ -84,6 +108,7 @@ def test_normalize_lowercase(): assert "author" in new_library.entries_dict["jones2023"] assert new_library.entries_dict["jones2023"]["author"] == "Jones, R." + def test_normalize_capitalized(): original_library = test_library_capitalizedkeys new_library = NormalizeFieldKeys(allow_inplace_modification=False).transform( From 757bfc9945807a924d202099a7a800a401ca72ac Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Fri, 23 Feb 2024 15:17:13 +0200 Subject: [PATCH 05/14] add comment --- bibtexparser/middlewares/fieldkeys.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/bibtexparser/middlewares/fieldkeys.py b/bibtexparser/middlewares/fieldkeys.py index e36c1ca..bdc2a91 100644 --- a/bibtexparser/middlewares/fieldkeys.py +++ b/bibtexparser/middlewares/fieldkeys.py @@ -29,6 +29,22 @@ def transform_entry( new_fields_dict: Dict[str, Field] = {} for field in entry.fields: normalized_key: str = field.key.lower() + # Since we always mutate `field` here, we will lose the original key after we normalize it. + # Upon a key name conflict, checking here allows us to emit a helpful warning that makes it + # easy to locate the offending key even if the entry is long (some online services include + # many optional fields into their BibTeX outputs). + # + # The other option to produce this helpful error message would be to collect a copy of the + # original key names, but almost always (no conflicts), they are not needed, so collecting + # them would only cause unnecessary pressure on the garbage collector. + # + # Alternatively, we could just report `entry.key`, not which key or keys were the offending ones, + # which runs faster, but is not as helpful for the user. + # + # Note that maximizing speed here is mainly important in applications where BibTeX parsing takes + # much of the run time, such as reference database converters. In most other applications, + # whatever the application does with the imported BibTeX data typically takes orders of magnitude + # longer than the BibTeX import. For such applications, the better warning message is more important. if normalized_key in seen_normalized_keys: logging.warning( f"NormalizeFieldKeys: in entry '{entry.key}': duplicate normalized key '{normalized_key}' (original '{field.key}'); overriding previous value" From 71ee1dee4e3dd4e3c89e74d5bd2d12d05c68f190 Mon Sep 17 00:00:00 2001 From: Tom de Geus Date: Fri, 23 Feb 2024 14:33:07 +0100 Subject: [PATCH 06/14] Simplifying tests --- bibtexparser/middlewares/fieldkeys.py | 9 +- tests/middleware_tests/test_fieldkeys.py | 176 +++++++---------------- 2 files changed, 59 insertions(+), 126 deletions(-) diff --git a/bibtexparser/middlewares/fieldkeys.py b/bibtexparser/middlewares/fieldkeys.py index bdc2a91..d44940a 100644 --- a/bibtexparser/middlewares/fieldkeys.py +++ b/bibtexparser/middlewares/fieldkeys.py @@ -47,13 +47,14 @@ def transform_entry( # longer than the BibTeX import. For such applications, the better warning message is more important. if normalized_key in seen_normalized_keys: logging.warning( - f"NormalizeFieldKeys: in entry '{entry.key}': duplicate normalized key '{normalized_key}' (original '{field.key}'); overriding previous value" + f"NormalizeFieldKeys: in entry '{entry.key}': " + + f"duplicate normalized key '{normalized_key}' " + + f"(original '{field.key}'); overriding previous value" ) seen_normalized_keys.add(normalized_key) field.key = normalized_key - new_fields_dict[normalized_key] = ( - field # This implements "last one wins", but otherwise preserves insertion order. - ) + # "last one wins", but otherwise preserve insertion order + new_fields_dict[normalized_key] = field new_fields: List[Field] = list(new_fields_dict.values()) entry.fields = new_fields diff --git a/tests/middleware_tests/test_fieldkeys.py b/tests/middleware_tests/test_fieldkeys.py index c253ec1..2eb8125 100644 --- a/tests/middleware_tests/test_fieldkeys.py +++ b/tests/middleware_tests/test_fieldkeys.py @@ -2,137 +2,69 @@ from bibtexparser.middlewares.enclosing import RemoveEnclosingMiddleware from bibtexparser.middlewares.fieldkeys import NormalizeFieldKeys from bibtexparser.model import Entry, Field +import re +import pytest -test_entry_1a = Entry( - entry_type="article", - key="smith2022", - fields=[ - Field(key="author", value='"Smith, J."'), - Field(key="title", value='"A Test Article"'), - Field(key="journal", value='"J. of Testing"'), - Field(key="month", value='"jan"'), - Field(key="year", value='"2022"'), - ], -) -test_entry_2a = Entry( - entry_type="book", - key="doe2021", - fields=[ - Field(key="author", value='"Doe, J."'), - Field(key="title", value='"A Test Book"'), - Field(key="publisher", value='"Test Pub."'), - Field(key="year", value='"2021"'), - Field(key="month", value="apr"), - ], -) -test_entry_3a = Entry( - entry_type="inproceedings", - key="jones2023", - fields=[ - Field(key="author", value='"Jones, R."'), - Field(key="title", value='"A Test Conf. Paper"'), - Field(key="booktitle", value='"Proc. of the Intl. Test Conf."'), - Field(key="year", value='"2023"'), - Field(key="month", value="8"), - ], -) -test_library_lowercasekeys = Library() -test_library_lowercasekeys.add(test_entry_1a) -test_library_lowercasekeys.add(test_entry_2a) -test_library_lowercasekeys.add(test_entry_3a) +entries = { + "article": { + "author": '"Smith, J."', + "title": '"A Test Article"', + "journal": '"J. of Testing"', + "month": '"jan"', + "year": '"2022"', + }, + "book": { + "author": '"Doe, J."', + "title": '"A Test Book"', + "publisher": '"Test Pub."', + "year": '"2021"', + "month": "apr", + }, + "inproceedings": { + "author": '"Jones, R."', + "title": '"A Test Conf. Paper"', + "booktitle": '"Proc. of the Intl. Test Conf."', + "year": '"2023"', + "month": "8", + }, +} -test_entry_1b = Entry( - entry_type="article", - key="smith2022", - fields=[ - Field(key="author", value='"Smith, J."'), - Field(key="title", value='"A Test Article"'), - Field(key="journal", value='"J. of Testing"'), - Field(key="month", value='"jan"'), - Field(key="year", value='"2022"'), - ], -) -test_entry_2b = Entry( - entry_type="book", - key="doe2021", - fields=[ - Field(key="author", value='"Doe, J."'), - Field(key="title", value='"A Test Book"'), - Field(key="publisher", value='"Test Pub."'), - Field(key="year", value='"2021"'), - Field(key="month", value="apr"), - ], -) -test_entry_3b = Entry( - entry_type="inproceedings", - key="jones2023", - fields=[ - Field(key="author", value='"Jones, R."'), - Field(key="title", value='"A Test Conf. Paper"'), - Field(key="booktitle", value='"Proc. of the Intl. Test Conf."'), - Field(key="year", value='"2023"'), - Field(key="month", value="8"), - ], -) -test_library_capitalizedkeys = Library() -test_library_capitalizedkeys.add(test_entry_1b) -test_library_capitalizedkeys.add(test_entry_2b) -test_library_capitalizedkeys.add(test_entry_3b) +ref = Library() +for i, (entry_type, fields) in enumerate(entries.items()): + f = [Field(key=k, value=v) for k, v in fields.items()] + ref.add(Entry(entry_type=entry_type, key=f"foo{i}", fields=f)) -def test_normalize_lowercase(): - original_library = test_library_lowercasekeys - new_library = NormalizeFieldKeys(allow_inplace_modification=False).transform( - original_library - ) - - assert "author" in new_library.entries_dict["smith2022"] - assert new_library.entries_dict["smith2022"]["author"] == '"Smith, J."' - assert "author" in new_library.entries_dict["doe2021"] - assert new_library.entries_dict["doe2021"]["author"] == '"Doe, J."' - assert "author" in new_library.entries_dict["jones2023"] - assert new_library.entries_dict["jones2023"]["author"] == '"Jones, R."' +def test_normalize_fieldkeys(): + """ + Check library with uppercase field keys. + """ - # Test the same after enclosing is removed - no_enclosing_library = RemoveEnclosingMiddleware( - allow_inplace_modification=False - ).transform(original_library) - new_library = NormalizeFieldKeys(allow_inplace_modification=False).transform( - no_enclosing_library - ) + lib = Library() + for i, (entry_type, fields) in enumerate(entries.items()): + f = [Field(key=k, value=v) for k, v in fields.items()] + lib.add(Entry(entry_type=entry_type, key=f"foo{i}", fields=f)) - assert "author" in new_library.entries_dict["smith2022"] - assert new_library.entries_dict["smith2022"]["author"] == "Smith, J." - assert "author" in new_library.entries_dict["doe2021"] - assert new_library.entries_dict["doe2021"]["author"] == "Doe, J." - assert "author" in new_library.entries_dict["jones2023"] - assert new_library.entries_dict["jones2023"]["author"] == "Jones, R." + lib = NormalizeFieldKeys().transform(lib) + for key in lib.entries_dict: + assert lib.entries_dict[key] == ref.entries_dict[key] -def test_normalize_capitalized(): - original_library = test_library_capitalizedkeys - new_library = NormalizeFieldKeys(allow_inplace_modification=False).transform( - original_library - ) - assert "author" in new_library.entries_dict["smith2022"] - assert new_library.entries_dict["smith2022"]["author"] == '"Smith, J."' - assert "author" in new_library.entries_dict["doe2021"] - assert new_library.entries_dict["doe2021"]["author"] == '"Doe, J."' - assert "author" in new_library.entries_dict["jones2023"] - assert new_library.entries_dict["jones2023"]["author"] == '"Jones, R."' +def test_normalize_fieldkeys_force_last(caplog): + """ + Check library with uppercase field keys and duplicate normalized keys. + """ + lib = Library() + for i, (entry_type, fields) in enumerate(entries.items()): + f = [Field(key=k.lower(), value="foo") for k in fields] + f += [Field(key=k.upper(), value=v) for k, v in fields.items()] + lib.add(Entry(entry_type=entry_type, key=f"foo{i}", fields=f)) - # Test the same after enclosing is removed - no_enclosing_library = RemoveEnclosingMiddleware( - allow_inplace_modification=False - ).transform(original_library) - new_library = NormalizeFieldKeys(allow_inplace_modification=False).transform( - no_enclosing_library + lib = NormalizeFieldKeys().transform(lib) + assert re.match( + r"(WARNING\s*)(\w*\:\w*\.py\:[0-9]*\s*)(NormalizeFieldKeys)(.*)", caplog.text ) - assert "author" in new_library.entries_dict["smith2022"] - assert new_library.entries_dict["smith2022"]["author"] == "Smith, J." - assert "author" in new_library.entries_dict["doe2021"] - assert new_library.entries_dict["doe2021"]["author"] == "Doe, J." - assert "author" in new_library.entries_dict["jones2023"] - assert new_library.entries_dict["jones2023"]["author"] == "Jones, R." + for key in lib.entries_dict: + assert lib.entries_dict[key] == ref.entries_dict[key] From 4b328ba438f598dc2af8074f598698200049e96f Mon Sep 17 00:00:00 2001 From: Tom de Geus Date: Fri, 23 Feb 2024 14:38:30 +0100 Subject: [PATCH 07/14] Simplifying comment --- bibtexparser/middlewares/fieldkeys.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/bibtexparser/middlewares/fieldkeys.py b/bibtexparser/middlewares/fieldkeys.py index d44940a..331749f 100644 --- a/bibtexparser/middlewares/fieldkeys.py +++ b/bibtexparser/middlewares/fieldkeys.py @@ -29,22 +29,11 @@ def transform_entry( new_fields_dict: Dict[str, Field] = {} for field in entry.fields: normalized_key: str = field.key.lower() - # Since we always mutate `field` here, we will lose the original key after we normalize it. - # Upon a key name conflict, checking here allows us to emit a helpful warning that makes it - # easy to locate the offending key even if the entry is long (some online services include - # many optional fields into their BibTeX outputs). - # - # The other option to produce this helpful error message would be to collect a copy of the - # original key names, but almost always (no conflicts), they are not needed, so collecting - # them would only cause unnecessary pressure on the garbage collector. - # - # Alternatively, we could just report `entry.key`, not which key or keys were the offending ones, - # which runs faster, but is not as helpful for the user. - # - # Note that maximizing speed here is mainly important in applications where BibTeX parsing takes - # much of the run time, such as reference database converters. In most other applications, - # whatever the application does with the imported BibTeX data typically takes orders of magnitude - # longer than the BibTeX import. For such applications, the better warning message is more important. + # if the normalized key is already present, apply "last one wins" + # otherwise preserve insertion order + # if a key is overwritten, emit a detailed warning + # if performance is a concern, we could emit a warning with only {entry.key} + # to remove "seen_normalized_keys" and this if statement if normalized_key in seen_normalized_keys: logging.warning( f"NormalizeFieldKeys: in entry '{entry.key}': " @@ -53,7 +42,6 @@ def transform_entry( ) seen_normalized_keys.add(normalized_key) field.key = normalized_key - # "last one wins", but otherwise preserve insertion order new_fields_dict[normalized_key] = field new_fields: List[Field] = list(new_fields_dict.values()) From 6ad724fa16326c7f7015f861669a0508e88fa84a Mon Sep 17 00:00:00 2001 From: Tom de Geus Date: Fri, 23 Feb 2024 14:42:37 +0100 Subject: [PATCH 08/14] linting --- tests/middleware_tests/test_fieldkeys.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/middleware_tests/test_fieldkeys.py b/tests/middleware_tests/test_fieldkeys.py index 2eb8125..b3db827 100644 --- a/tests/middleware_tests/test_fieldkeys.py +++ b/tests/middleware_tests/test_fieldkeys.py @@ -1,9 +1,11 @@ +import re + +import pytest + from bibtexparser import Library from bibtexparser.middlewares.enclosing import RemoveEnclosingMiddleware from bibtexparser.middlewares.fieldkeys import NormalizeFieldKeys from bibtexparser.model import Entry, Field -import re -import pytest entries = { "article": { From 9eed680032e2ccc6d8ab00a0bee88afb8e0e7a8a Mon Sep 17 00:00:00 2001 From: Tom de Geus Date: Fri, 23 Feb 2024 16:40:34 +0100 Subject: [PATCH 09/14] Fixing linting and return typo --- bibtexparser/middlewares/fieldkeys.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bibtexparser/middlewares/fieldkeys.py b/bibtexparser/middlewares/fieldkeys.py index 331749f..6c8e714 100644 --- a/bibtexparser/middlewares/fieldkeys.py +++ b/bibtexparser/middlewares/fieldkeys.py @@ -22,9 +22,8 @@ def __init__(self, allow_inplace_modification: bool = True): allow_parallel_execution=True, ) - def transform_entry( - self, entry: Entry, library: "Library" - ) -> Union[Block, Collection[Block], None]: + # docstr-coverage: inherited + def transform_entry(self, entry: Entry, library: "Library") -> Entry: seen_normalized_keys: Set[str] = set() new_fields_dict: Dict[str, Field] = {} for field in entry.fields: From 8301b0dbc4bee1743e31583741ce4bc1f3573124 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Thu, 29 Feb 2024 14:52:02 +0200 Subject: [PATCH 10/14] fix docstring --- tests/middleware_tests/test_fieldkeys.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/middleware_tests/test_fieldkeys.py b/tests/middleware_tests/test_fieldkeys.py index b3db827..a495cf9 100644 --- a/tests/middleware_tests/test_fieldkeys.py +++ b/tests/middleware_tests/test_fieldkeys.py @@ -39,7 +39,7 @@ def test_normalize_fieldkeys(): """ - Check library with uppercase field keys. + Check library with lowercase field keys. """ lib = Library() From 53bce98da128bf5eb557fc0f2481a8efa40d13af Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Thu, 29 Feb 2024 14:52:13 +0200 Subject: [PATCH 11/14] use meaningful names --- tests/middleware_tests/test_fieldkeys.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/middleware_tests/test_fieldkeys.py b/tests/middleware_tests/test_fieldkeys.py index a495cf9..dadf9bd 100644 --- a/tests/middleware_tests/test_fieldkeys.py +++ b/tests/middleware_tests/test_fieldkeys.py @@ -34,7 +34,7 @@ ref = Library() for i, (entry_type, fields) in enumerate(entries.items()): f = [Field(key=k, value=v) for k, v in fields.items()] - ref.add(Entry(entry_type=entry_type, key=f"foo{i}", fields=f)) + ref.add(Entry(entry_type=entry_type, key=f"entry{i}", fields=f)) def test_normalize_fieldkeys(): @@ -45,7 +45,7 @@ def test_normalize_fieldkeys(): lib = Library() for i, (entry_type, fields) in enumerate(entries.items()): f = [Field(key=k, value=v) for k, v in fields.items()] - lib.add(Entry(entry_type=entry_type, key=f"foo{i}", fields=f)) + lib.add(Entry(entry_type=entry_type, key=f"entry{i}", fields=f)) lib = NormalizeFieldKeys().transform(lib) @@ -59,9 +59,9 @@ def test_normalize_fieldkeys_force_last(caplog): """ lib = Library() for i, (entry_type, fields) in enumerate(entries.items()): - f = [Field(key=k.lower(), value="foo") for k in fields] + f = [Field(key=k.lower(), value="dummyvalue") for k in fields] f += [Field(key=k.upper(), value=v) for k, v in fields.items()] - lib.add(Entry(entry_type=entry_type, key=f"foo{i}", fields=f)) + lib.add(Entry(entry_type=entry_type, key=f"entry{i}", fields=f)) lib = NormalizeFieldKeys().transform(lib) assert re.match( From ac825c3d0cb9df0d1e61504139206d83292aa3d5 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Thu, 29 Feb 2024 14:57:21 +0200 Subject: [PATCH 12/14] remove unused imports --- bibtexparser/middlewares/fieldkeys.py | 4 ++-- tests/middleware_tests/test_fieldkeys.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/bibtexparser/middlewares/fieldkeys.py b/bibtexparser/middlewares/fieldkeys.py index 6c8e714..d5b5cfd 100644 --- a/bibtexparser/middlewares/fieldkeys.py +++ b/bibtexparser/middlewares/fieldkeys.py @@ -1,8 +1,8 @@ import logging -from typing import Collection, Dict, List, Set, Union +from typing import Dict, List, Set from bibtexparser.library import Library -from bibtexparser.model import Block, Entry, Field +from bibtexparser.model import Entry, Field from .middleware import BlockMiddleware diff --git a/tests/middleware_tests/test_fieldkeys.py b/tests/middleware_tests/test_fieldkeys.py index dadf9bd..deebd20 100644 --- a/tests/middleware_tests/test_fieldkeys.py +++ b/tests/middleware_tests/test_fieldkeys.py @@ -3,7 +3,6 @@ import pytest from bibtexparser import Library -from bibtexparser.middlewares.enclosing import RemoveEnclosingMiddleware from bibtexparser.middlewares.fieldkeys import NormalizeFieldKeys from bibtexparser.model import Entry, Field From 34abbac5254de50ce400820dd0165d08588079df Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Thu, 29 Feb 2024 14:57:32 +0200 Subject: [PATCH 13/14] PEP8 --- bibtexparser/middlewares/fieldkeys.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bibtexparser/middlewares/fieldkeys.py b/bibtexparser/middlewares/fieldkeys.py index d5b5cfd..79b737f 100644 --- a/bibtexparser/middlewares/fieldkeys.py +++ b/bibtexparser/middlewares/fieldkeys.py @@ -35,9 +35,9 @@ def transform_entry(self, entry: Entry, library: "Library") -> Entry: # to remove "seen_normalized_keys" and this if statement if normalized_key in seen_normalized_keys: logging.warning( - f"NormalizeFieldKeys: in entry '{entry.key}': " - + f"duplicate normalized key '{normalized_key}' " - + f"(original '{field.key}'); overriding previous value" + f"NormalizeFieldKeys: in entry '{entry.key}': " + + f"duplicate normalized key '{normalized_key}' " + + f"(original '{field.key}'); overriding previous value" ) seen_normalized_keys.add(normalized_key) field.key = normalized_key From 7134a56dd9392999a2e9603a77ed8bb2e560da87 Mon Sep 17 00:00:00 2001 From: Tom de Geus Date: Thu, 29 Feb 2024 14:18:26 +0100 Subject: [PATCH 14/14] pre-commit --- bibtexparser/middlewares/fieldkeys.py | 13 ++++++++----- tests/middleware_tests/test_fieldkeys.py | 9 +++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/bibtexparser/middlewares/fieldkeys.py b/bibtexparser/middlewares/fieldkeys.py index 79b737f..edb5a7e 100644 --- a/bibtexparser/middlewares/fieldkeys.py +++ b/bibtexparser/middlewares/fieldkeys.py @@ -1,8 +1,11 @@ import logging -from typing import Dict, List, Set +from typing import Dict +from typing import List +from typing import Set from bibtexparser.library import Library -from bibtexparser.model import Entry, Field +from bibtexparser.model import Entry +from bibtexparser.model import Field from .middleware import BlockMiddleware @@ -35,9 +38,9 @@ def transform_entry(self, entry: Entry, library: "Library") -> Entry: # to remove "seen_normalized_keys" and this if statement if normalized_key in seen_normalized_keys: logging.warning( - f"NormalizeFieldKeys: in entry '{entry.key}': " + - f"duplicate normalized key '{normalized_key}' " + - f"(original '{field.key}'); overriding previous value" + f"NormalizeFieldKeys: in entry '{entry.key}': " + + f"duplicate normalized key '{normalized_key}' " + + f"(original '{field.key}'); overriding previous value" ) seen_normalized_keys.add(normalized_key) field.key = normalized_key diff --git a/tests/middleware_tests/test_fieldkeys.py b/tests/middleware_tests/test_fieldkeys.py index deebd20..acff042 100644 --- a/tests/middleware_tests/test_fieldkeys.py +++ b/tests/middleware_tests/test_fieldkeys.py @@ -1,10 +1,9 @@ import re -import pytest - from bibtexparser import Library from bibtexparser.middlewares.fieldkeys import NormalizeFieldKeys -from bibtexparser.model import Entry, Field +from bibtexparser.model import Entry +from bibtexparser.model import Field entries = { "article": { @@ -63,9 +62,7 @@ def test_normalize_fieldkeys_force_last(caplog): lib.add(Entry(entry_type=entry_type, key=f"entry{i}", fields=f)) lib = NormalizeFieldKeys().transform(lib) - assert re.match( - r"(WARNING\s*)(\w*\:\w*\.py\:[0-9]*\s*)(NormalizeFieldKeys)(.*)", caplog.text - ) + assert re.match(r"(WARNING\s*)(\w*\:\w*\.py\:[0-9]*\s*)(NormalizeFieldKeys)(.*)", caplog.text) for key in lib.entries_dict: assert lib.entries_dict[key] == ref.entries_dict[key]