Skip to content

strawman

dummy_df(shape, content_length=3, allowed_chars=string.ascii_letters, columns=None, seed=None)

Create a dummy DataFrame.

Parameters:

Name Type Description Default
shape Tuple[int, int]

Dimensions of the DataFrame

required
content_length int

length of the strings in the cells

3
allowed_chars str

string containing the allowed chars

ascii_letters
columns List[str]

column names

None
seed int

seed for reproducibility

None

Returns:

Type Description
DataFrame

Randomly generated DataFrame

Raises:

Type Description
ValueError

if length of columns does not match shape

Example:

>>> from strawman import dummy_df
>>> dummy_df((10,3))
         0    1    2
    0  Ass  wEB  jEx
    1  xxD  TtW  Xzs
    2  ITh  mpj  tgy
    3  rgN  ZyW  kzR
    4  FPO  XiY  ARn
    5  gCh  ArF  QlR
    6  AxL  PMg  oMG
    7  cBo  NEN  ljX
    8  mpT  rjh  smJ
    9  lZe  Krw  TRs
Source code in /home/docs/checkouts/readthedocs.org/user_builds/strawman/envs/latest/lib/python3.10/site-packages/strawman/dummy_pandas.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def dummy_df(
    shape: Tuple[int, int],
    content_length: int = 3,
    allowed_chars: str = string.ascii_letters,
    columns: List[str] = None,
    seed: int = None,
) -> pd.DataFrame:
    """Create a dummy DataFrame filled with random strings.

    Args:
        shape: Dimensions of the DataFrame as (rows, columns)
        content_length: length of the strings in the cells
        allowed_chars: string containing the allowed chars
        columns: column names, pandas default labels are used if None
        seed: seed for reproducibility

    Returns:
        Randomly generated DataFrame

    Raises:
        ValueError: if length of columns does not match shape

    Example:

    ```pycon
    >>> from strawman import dummy_df
    >>> dummy_df((10,3))
             0    1    2
        0  Ass  wEB  jEx
        1  xxD  TtW  Xzs
        2  ITh  mpj  tgy
        3  rgN  ZyW  kzR
        4  FPO  XiY  ARn
        5  gCh  ArF  QlR
        6  AxL  PMg  oMG
        7  cBo  NEN  ljX
        8  mpT  rjh  smJ
        9  lZe  Krw  TRs
    ```
    """
    # Compare against None instead of relying on truthiness: an empty list
    # of column names must still be validated against the requested shape.
    if columns is not None and len(columns) != shape[1]:
        raise ValueError(
            f"Length of columns ({len(columns)}) does not match shape ({shape})!"
        )
    rng = _init_rng(seed=seed)
    # Placeholder frame of NaNs; every cell is overwritten below.
    df = pd.DataFrame(np.full(shape, np.nan), columns=columns)

    def _random_cell(_):
        # The incoming cell value (NaN) is irrelevant, only the shared rng matters.
        return random_string_generator(
            str_size=content_length, allowed_chars=allowed_chars, rng=rng
        )

    # ``DataFrame.applymap`` is deprecated since pandas 2.1 in favour of
    # ``DataFrame.map``; check on the class so a column named "map" cannot
    # be mistaken for the method on older pandas versions.
    if hasattr(pd.DataFrame, "map"):
        return df.map(_random_cell)
    return df.applymap(_random_cell)

dummy_triples(length, num_entities=None, num_rel=None, entity_prefix='e', relation_prefix='rel', relation_triples=True, entity_ids=None, relation_ids=None, columns=None, content_length=3, allowed_chars=string.ascii_letters, seed=None)

Create dummy DataFrame in form of triples.

The default columns are ["head","relation","tail"]. Entries in the head column have an entity_prefix ("e" by default), with numbers as suffix. The entries in the relation column are built analogously, but with the relation_prefix.

All entities show up at least once. Self-links (e.g. ["e1", "rel1", "e1"]) are avoided.

If relation_triples is False, the last column contains randomly generated strings.

Parameters:

Name Type Description Default
length int

Length of the DataFrame

required
num_entities int

Number of unique entities

None
num_rel int

Number of unique relations

None
entity_prefix str

Prefix for entity strings

'e'
relation_prefix str

Prefix for relation strings

'rel'
relation_triples bool

If True, the last column contains entities; otherwise it contains randomly generated strings

True
entity_ids List[str]

Predefined entity ids

None
relation_ids List[str]

Predefined relation ids

None
columns List[str]

Column names ["head","relation","tail"] by default

None
content_length int

Length of randomly generated string

3
allowed_chars str

Allowed characters in randomly generated string

ascii_letters
seed int

Seed for reproducibility.

None

Returns:

Type Description
DataFrame

randomly generated triple DataFrame

Raises:

Type Description
ValueError

If dummy_triples cannot be generated with the given specifications

Example:

>>> from strawman import dummy_triples
>>> df = dummy_triples(10)
>>> df
      head relation tail
0   e4     rel1   e0
1   e3     rel1   e1
2   e0     rel1   e5
3   e6     rel2   e3
4   e6     rel0   e4
5   e1     rel1   e0
6   e2     rel1   e0
7   e5     rel1   e3
8   e6     rel2   e0
9   e6     rel0   e2

Create an attribute triple DataFrame with predefined entities

>>> dummy_triples(10, entity_ids=set(df["head"]), relation_triples=False)
      head relation tail
0   e5     rel1  LOR
1   e6     rel1  rmM
2   e4     rel2  rmM
3   e0     rel2  LOR
4   e1     rel0  rmM
5   e5     rel2  Mda
6   e2     rel1  yhf
7   e3     rel2  ata
8   e5     rel2  gHk
9   e5     rel0  rmM
Source code in /home/docs/checkouts/readthedocs.org/user_builds/strawman/envs/latest/lib/python3.10/site-packages/strawman/dummy_pandas.py
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
def dummy_triples(
    length: int,
    num_entities: int = None,
    num_rel: int = None,
    entity_prefix: str = "e",
    relation_prefix: str = "rel",
    relation_triples: bool = True,
    entity_ids: List[str] = None,
    relation_ids: List[str] = None,
    columns: List[str] = None,
    content_length: int = 3,
    allowed_chars: str = string.ascii_letters,
    seed: int = None,
) -> pd.DataFrame:
    """Create dummy DataFrame in form of triples.

    The default columns are ["head","relation","tail"].
    Entries in the head column have an `entity_prefix` ("e" by default),
    with numbers as suffix. The entries in the relation column are built
    analogously, but with the `relation_prefix`.

    All entities show up at least once. Self-links (e.g. ["e1", "rel1", "e1"]) are avoided.

    If `relation_triples` is False, the last column contains randomly generated strings.

    Args:
        length: Length of the DataFrame
        num_entities: Number of unique entities
        num_rel: Number of unique relations
        entity_prefix: Prefix for entity strings
        relation_prefix: Prefix for relation strings
        relation_triples: If True the last column contains entities, else randomly generated string
        entity_ids: Predefined entity ids
        relation_ids: Predefined relation ids
        columns: Column names ["head","relation","tail"] by default
        content_length: Length of randomly generated string
        allowed_chars: Allowed characters in randomly generated string
        seed: Seed for reproducibility.

    Returns:
        randomly generated triple DataFrame

    Raises:
        ValueError: If dummy_triples cannot be generated with the given specifications

    Example:
    ```pycon
    >>> from strawman import dummy_triples
    >>> df = dummy_triples(10)
    >>> df
          head relation tail
    0   e4     rel1   e0
    1   e3     rel1   e1
    2   e0     rel1   e5
    3   e6     rel2   e3
    4   e6     rel0   e4
    5   e1     rel1   e0
    6   e2     rel1   e0
    7   e5     rel1   e3
    8   e6     rel2   e0
    9   e6     rel0   e2
    ```

    Create an attribute triple DataFrame with predefined entities

    ```pycon
    >>> dummy_triples(10, entity_ids=set(df["head"]), relation_triples=False)
          head relation tail
    0   e5     rel1  LOR
    1   e6     rel1  rmM
    2   e4     rel2  rmM
    3   e0     rel2  LOR
    4   e1     rel0  rmM
    5   e5     rel2  Mda
    6   e2     rel1  yhf
    7   e3     rel2  ata
    8   e5     rel2  gHk
    9   e5     rel0  rmM
    ```
    """
    _coherence_check(
        length=length,
        num_entities=num_entities,
        num_rel=num_rel,
        entity_prefix=entity_prefix,
        relation_triples=relation_triples,
        columns=columns,
        content_length=content_length,
        allowed_chars=allowed_chars,
    )

    # Fill in defaults that depend on the requested length.
    if columns is None:
        columns = TRIPLES_COL
    if num_entities is None:
        num_entities = math.ceil(length * 0.7)
    if num_rel is None:
        minimum_rel = int(length / (num_entities * num_entities)) + 1
        num_rel = min(max(minimum_rel, int(num_entities * 0.7)), length)

    if seed is None:
        # Draw a concrete seed and log it so a run can be repeated later.
        seed = np.random.default_rng().integers(0, 10000)
        logger.debug("Selected seed %s", seed)
    rng = np.random.default_rng(seed=seed)

    head_values = (
        [entity_prefix + str(i) for i in range(num_entities)]
        if entity_ids is None
        else entity_ids
    )
    rel_values = (
        [relation_prefix + str(i) for i in range(num_rel)]
        if relation_ids is None
        else relation_ids
    )
    # Relation triples reuse the entities as tails; attribute triples draw
    # random strings that honour the caller's ``allowed_chars``.
    tail_values = (
        head_values
        if relation_triples
        else [
            random_string_generator(
                str_size=content_length, allowed_chars=allowed_chars, rng=rng
            )
            for _ in range(num_entities)
        ]
    )

    # A dict is used as an insertion-ordered set: duplicates are still
    # dropped, but the row order no longer follows set iteration order
    # (which varies with string hashing between interpreter runs).
    rows = {}
    ensured_all_entities = False
    max_tries = length * 3
    while len(rows) < length:
        if max_tries == 0:
            raise ValueError(
                "Could not create DataFrame with the given specifications..."
            )
        if not ensured_all_entities:
            # First pass: pair every head value with a relation once so that
            # each entity and relation is used at least one time.
            longest = max(len(head_values), len(rel_values))
            for head, rel in zip(
                shuffled_overlong(head_values, longest, rng),
                shuffled_overlong(rel_values, longest, rng),
            ):
                tail = _choose_tail(head=head, tail_values=tail_values, rng=rng)
                rows[(head, rel, tail)] = None
            ensured_all_entities = True
        else:
            # Later passes: add one random triple per attempt.
            head = sequence_choice(head_values, rng)
            rel, tail = _choose_rel_tail(
                head=head, rel_values=rel_values, tail_values=tail_values, rng=rng
            )
            rows[(head, rel, tail)] = None
        max_tries -= 1
    return pd.DataFrame(list(rows), columns=columns)