Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 112 additions & 1 deletion datadog_checks_base/datadog_checks/base/utils/tagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
# All rights reserved
# Licensed under Simplified BSD License (see LICENSE)

from collections.abc import Iterator

try:
import tagger
except ImportError:
from datadog_checks.base.stubs import tagger # noqa: F401


GENERIC_TAGS = {
GENERIC_TAGS: set[str] = {
'cluster_name',
'clustername',
'cluster',
Expand All @@ -21,3 +23,112 @@
'service',
'version',
}


class TagsSet:
"""
A data structure to manage a collection of tags supporting both formats:
- key:value pairs
- standalone values (stored with empty string key)
Supports:
- add(tag_string): add a tag in 'key:value' or 'value' format
- add_tag(key, value): add a key-value pair tag
- add_standalone_tag(value): add a standalone value tag
- add_unique_tag(key, value): add a tag ensuring the key has only this value
- get_tag(key): return a set of all values for the given key
- get_tags(sort=True): return a list of formatted tag strings
- iteration: iterate over tags yielding (key, value) tuples
- remove(tag_string): remove a tag in 'key:value' or 'value' format
- remove_tag(key, value=None): remove all tags under a given key, or a specific key:value tag
- clear(): remove all tags
"""

def __init__(self) -> None:
self._data: dict[str, set[str]] = {}

def add_tag(self, key: str, value: str) -> None:
"""Add a tag with explicit key and value.
For standalone value tags, use add_standalone_tag() instead.
Raises:
ValueError: If key is empty
"""
if not key:
raise ValueError("Tag key cannot be empty. Use add_standalone_tag() for standalone values.")

if key not in self._data:
self._data[key] = set()
self._data[key].add(value)
Comment on lines +61 to +63
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can simplify this kind of operation with

Suggested change
if key not in self._data:
self._data[key] = set()
self._data[key].add(value)
self._data.setdefault(key, set()).add(value)

Or better yet, define self._data as a defaultdict(set) from collections. Every time you try to access a key that does not exist will return an empty set.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't want to use the defaultdict because of the silent KeyError that would just quietly return a default value.
For simplicity, keep it again in 3 line seemed simpler and easier to read as well.


def add_standalone_tag(self, value: str) -> None:
"""Add a standalone value tag (no key).
Standalone tags are stored internally with an empty key.
"""
if '' not in self._data:
self._data[''] = set()
self._data[''].add(value)

def add_unique_tag(self, key: str, value: str) -> None:
"""Add a tag under given key, ensuring the key has only this value.
Raises:
ValueError: If key is empty
"""
if not key:
raise ValueError("Tag key cannot be empty. Use add_standalone_tag() for standalone values.")
self._data[key] = {value}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we error here? I feel that not erroring can cause unwanted effects in which users end up overriding tags. Maybe give an option to override that by default is set to false. If override is true then we override instead of erroring.

Another question is, if the intention is to prevent the leaks we have seen in the past. Shouldn't we have a single add method the user interacts with to enforce uniqueness and only allow non-unique tags when the user specifically request it? I feel that having aadd_tag method can cause the user to go stright to that one without realizing there is a more specific add_unique_tag method if they want the tag to be unique. It is more common for a tag to be unique than not.

With a single add_tag method that enforces uniqueness, fi the user wants to add non unique tags, the code will break during testing and they will be force to add an unique flag set to False.


def get_tag(self, key: str) -> set[str]:
"""Return all values for the given key. Returns an empty set if key doesn't exist."""
return self._data.get(key, set())

def get_standalone_tags(self) -> set[str]:
return self.get_tag('')

def _get_tag_tuples(self, sort: bool = True) -> list[tuple[str, str]]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we are giving the user the option to sort, we should give them the option to sort ascending or descending as well. WDYT?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was waiting to check what we do with the tags arg that we pass to the base checks methods, maybe we already sort them and preprocess them somewhere.

"""Return all tags as a list of (key, value) tuples, sorted if requested."""
tags_list: list[tuple[str, str]] = []
for key, values in self._data.items():
for val in values:
tags_list.append((key, val))
Comment on lines +93 to +96
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

List comprehension is better for these kind of operations and type is inferred

tags_list = [(key, val) for key, values in self._data.items() for val in values]

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think list comprehensions in such scenarios could can reduce the readability of the code, not worth saving 3 lines of code in my opinion.

return sorted(tags_list) if sort else tags_list

def get_tags(self, sort: bool = True) -> list[str]:
"""Return all tags as a list of formatted strings, sorted if requested.
Returns tags in their original format:
- 'key:value' for key-value pairs
- 'value' for standalone values
"""
tags = []
for key, value in self._get_tag_tuples(sort=False):
if key:
tags.append(f"{key}:{value}")
else:
tags.append(value)
return sorted(tags) if sort else tags

def remove_tag(self, key: str, value: str | None = None) -> None:
"""Remove tag(s) under the given key.
If value is None, remove all tags under the given key.
If value is provided, remove only the specific key:value tag.
"""
if value is None:
self._data.pop(key, None)
else:
if key in self._data:
self._data[key].discard(value)
if not self._data[key]:
del self._data[key]

def clear(self) -> None:
"""Remove all tags."""
self._data.clear()

def __iter__(self) -> Iterator[tuple[str, str]]:
"""Allow iteration over tags: for tag in ts or list(ts)."""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does this mean?

for tag in ts or list(ts)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was just a comment to explain how the iter function works, I'll clarify it / remove it.

return iter(self._get_tag_tuples())
185 changes: 185 additions & 0 deletions datadog_checks_base/tests/base/checks/test_tagging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
# (C) Datadog, Inc. 2025-present
# All rights reserved
# Licensed under Simplified BSD License (see LICENSE)

import pytest

from datadog_checks.base.utils.tagging import TagsSet


class TestTagsSet:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we get rid of the class? Testing with classes as namespaces is a left over from the unittest framework from python core but not necessary for pytest which relies on modules for namespacing.

Adding a class adds unnecessary burden to build the class and on reporting in which each line now will include the module the test is in and the name of the class. Pytest supports classes more as a backward compatibility feature to run tests written with the unittest freamework.

Some tests in the repo have classes but we are not using this anymore in newer implementations. WDYT?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I just kept the same testing style as the codebase in the base check. I'll look into it!
Are there any drawbacks of having mixed styles?

"""Test the TagsSet data structure with minimal, focused tests."""

def test_init_empty(self):
"""Test empty initialization."""
tags = TagsSet()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could be a fixture and we do not need to initialize an emtpy TagSet in every test.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hot take (don’t hate me): I’m not a fan of fixtures. I just find it way clearer when we start with an empty tag set and build it up, nice and simple: Arrange, Act, Assert. Fixtures do all kinds of stuff behind the scenes, which makes it harder to follow sometimes. I’d use them here if setting up the TagsSet were more complex.

assert tags.get_tags() == []

def test_add_single_tag(self):
"""Test adding a single tag."""
tags = TagsSet()
tags.add_tag('env', 'prod')
assert tags.get_tags() == ['env:prod']

def test_add_multiple_values_same_key(self):
"""Test adding multiple values to same key."""
tags = TagsSet()
tags.add_tag('env', 'prod')
tags.add_tag('env', 'dev')
assert tags.get_tags() == ['env:dev', 'env:prod']

def test_add_different_keys(self):
"""Test adding tags with different keys."""
tags = TagsSet()
tags.add_tag('env', 'prod')
tags.add_tag('app', 'web')
assert tags.get_tags() == ['app:web', 'env:prod']
Comment on lines +13 to +36
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can probably parameterize all these tests, they are doing the same thing in different ways.

The same with the tests below that test the same thing with different inputs.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Honestly, I prefer the simpler way, the duplicated code is minimal, and it’s not worth the added complexity and loss of readability that comes with using pytest.parametrize


def test_add_unique_tag_replaces(self):
"""Test add_unique_tag replaces existing values."""
tags = TagsSet()
tags.add_tag('env', 'prod')
tags.add_tag('env', 'dev')
tags.add_unique_tag('env', 'test')
assert tags.get_tags() == ['env:test']

def test_add_unique_tag_new_key(self):
"""Test add_unique_tag with new key."""
tags = TagsSet()
tags.add_unique_tag('env', 'prod')
assert tags.get_tags() == ['env:prod']

def test_get_tag_empty(self):
"""Test get_tag on empty set."""
tags = TagsSet()
assert tags.get_tag('env') == set()

def test_get_tag_single_value(self):
"""Test get_tag with single value."""
tags = TagsSet()
tags.add_tag('env', 'prod')
assert tags.get_tag('env') == {'prod'}

def test_get_tag_multiple_values(self):
"""Test get_tag with multiple values."""
tags = TagsSet()
tags.add_tag('env', 'prod')
tags.add_tag('env', 'dev')
assert tags.get_tag('env') == {'prod', 'dev'}

def test_get_tag_nonexistent_key(self):
"""Test get_tag with non-existent key."""
tags = TagsSet()
tags.add_tag('env', 'prod')
assert tags.get_tag('app') == set()

def test_get_tags_unsorted(self):
"""Test get_tags with sort=False."""
tags = TagsSet()
tags.add_tag('b', '1')
tags.add_tag('a', '2')
result = tags.get_tags(sort=False)
assert set(result) == {'a:2', 'b:1'}

def test_get_tags_sorted(self):
"""Test get_tags with sort=True."""
tags = TagsSet()
tags.add_tag('b', '1')
tags.add_tag('a', '2')
assert tags.get_tags(sort=True) == ['a:2', 'b:1']

def test_remove_tag_all_values(self):
"""Test removing all values for a key."""
tags = TagsSet()
tags.add_tag('env', 'prod')
tags.add_tag('env', 'dev')
tags.remove_tag('env')
assert tags.get_tags() == []

def test_remove_tag_specific_value(self):
"""Test removing specific value."""
tags = TagsSet()
tags.add_tag('env', 'prod')
tags.add_tag('env', 'dev')
tags.remove_tag('env', 'dev')
assert tags.get_tags() == ['env:prod']

def test_remove_tag_nonexistent_key(self):
"""Test removing non-existent key doesn't error."""
tags = TagsSet()
tags.remove_tag('env') # Should not raise
assert tags.get_tags() == []

def test_remove_tag_nonexistent_value(self):
"""Test removing non-existent value doesn't error."""
tags = TagsSet()
tags.add_tag('env', 'prod')
tags.remove_tag('env', 'dev') # Should not raise
assert tags.get_tags() == ['env:prod']

def test_clear_with_tags(self):
"""Test clear removes all tags."""
tags = TagsSet()
tags.add_tag('env', 'prod')
tags.add_tag('app', 'web')
tags.clear()
assert tags.get_tags() == []

def test_iterator_empty(self):
"""Test iteration on empty set."""
tags = TagsSet()
assert list(tags) == []

def test_iterator_multiple_tags(self):
"""Test iteration yields tuples."""
tags = TagsSet()
tags.add_tag('env', 'prod')
tags.add_tag('app', 'web')
assert list(tags) == [('app', 'web'), ('env', 'prod')]

def test_empty_key_raises_error(self):
"""Test that empty key raises ValueError."""
tags = TagsSet()
with pytest.raises(ValueError, match="Tag key cannot be empty"):
tags.add_tag('', 'value')

def test_empty_key_add_unique_raises_error(self):
"""Test that empty key in add_unique_tag raises ValueError."""
tags = TagsSet()
with pytest.raises(ValueError, match="Tag key cannot be empty"):
tags.add_unique_tag('', 'value')

def test_special_chars_colon(self):
"""Test key/value with colons."""
tags = TagsSet()
tags.add_tag('url', 'http://example.com')
assert tags.get_tags() == ['url:http://example.com']

def test_sorting_by_key_then_value(self):
"""Test sorting is by key first, then value."""
tags = TagsSet()
tags.add_tag('a', '2')
tags.add_tag('a', '1')
tags.add_tag('b', '1')
assert tags.get_tags() == ['a:1', 'a:2', 'b:1']

# Tests for both tag formats (key:value and standalone value)
def test_add_standalone_tag(self):
"""Test adding standalone value tags."""
tags = TagsSet()
tags.add_standalone_tag('production')
tags.add_standalone_tag('critical')
assert sorted(tags.get_tags()) == ['critical', 'production']

def test_standalone_tags_use_empty_key(self):
"""Test that standalone tags are stored with empty key."""
tags = TagsSet()
tags.add_standalone_tag('production')
tags.add_standalone_tag('staging')
# Verify they're stored under empty key
assert tags.get_standalone_tags() == {'production', 'staging'}

tags.add_tag('env', 'prod')
tags.add_tag('env', 'staging')
# Verify they appear as standalone in output
assert sorted(tags.get_tags()) == ['env:prod', 'env:staging', 'production', 'staging']
Loading