strawman

`dummy_df(shape, content_length=3, allowed_chars=string.ascii_letters, columns=None, seed=None)`

Create a dummy DataFrame.

Parameters:

Name	Type	Description	Default
`shape`	`Tuple[int, int]`	Dimensions of the DataFrame	required
`content_length`	`int`	length of the strings in the cells	`3`
`allowed_chars`	`str`	string containing the allowed chars	`ascii_letters`
`columns`	`List[str]`	column names	`None`
`seed`	`int`	seed for reproducibility	`None`

Returns:

Type	Description
`DataFrame`	Randomly generated DataFrame

Raises:

Type	Description
`ValueError`	if length of columns does not match shape

Example:

>>> from strawman import dummy_df
>>> dummy_df((10,3))
         0    1    2
    0  Ass  wEB  jEx
    1  xxD  TtW  Xzs
    2  ITh  mpj  tgy
    3  rgN  ZyW  kzR
    4  FPO  XiY  ARn
    5  gCh  ArF  QlR
    6  AxL  PMg  oMG
    7  cBo  NEN  ljX
    8  mpT  rjh  smJ
    9  lZe  Krw  TRs

Source code in

/home/docs/checkouts/readthedocs.org/user_builds/strawman/envs/latest/lib/python3.10/site-packages/strawman/dummy_pandas.py

def dummy_df(
    shape: Tuple[int, int],
    content_length: int = 3,
    allowed_chars: str = string.ascii_letters,
    columns: List[str] = None,
    seed: int = None,
) -> pd.DataFrame:
    """Create a dummy DataFrame filled with random strings.

    Args:
        shape: Dimensions of the DataFrame as (rows, columns)
        content_length: length of the strings in the cells
        allowed_chars: string containing the allowed chars
        columns: column names; pandas' default labels are used if None
        seed: seed for reproducibility

    Returns:
        Randomly generated DataFrame

    Raises:
        ValueError: if length of columns does not match shape

    Example:

    ```pycon
    >>> from strawman import dummy_df
    >>> dummy_df((10,3))
             0    1    2
        0  Ass  wEB  jEx
        1  xxD  TtW  Xzs
        2  ITh  mpj  tgy
        3  rgN  ZyW  kzR
        4  FPO  XiY  ARn
        5  gCh  ArF  QlR
        6  AxL  PMg  oMG
        7  cBo  NEN  ljX
        8  mpT  rjh  smJ
        9  lZe  Krw  TRs
    ```
    """
    # Test against None instead of truthiness: an empty list must still be
    # validated, and array-like labels have no unambiguous truth value.
    if columns is not None and len(columns) != shape[1]:
        raise ValueError(
            f"Length of columns ({len(columns)}) does not match shape ({shape})!"
        )
    rng = _init_rng(seed=seed)
    # NaN placeholder frame: only its shape and labels matter, every cell is
    # overwritten by the element-wise mapping below.
    df = pd.DataFrame(np.full(shape, np.nan), columns=columns)

    def _random_cell(_):
        # The placeholder value is ignored; each cell draws a fresh string.
        return random_string_generator(
            str_size=content_length, allowed_chars=allowed_chars, rng=rng
        )

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. The check is done on the class (not the instance) so a
    # column that happens to be called "map" cannot shadow the method lookup.
    if hasattr(pd.DataFrame, "map"):
        return df.map(_random_cell)
    return df.applymap(_random_cell)

`dummy_triples(length, num_entities=None, num_rel=None, entity_prefix='e', relation_prefix='rel', relation_triples=True, entity_ids=None, relation_ids=None, columns=None, content_length=3, allowed_chars=string.ascii_letters, seed=None)`

Create dummy DataFrame in form of triples.

The default columns are ["head","relation","tail"]. Entries in the head column consist of the entity_prefix ("e" by default) followed by a number. Entries in the relation column are built analogously, using the relation_prefix.

All entities show up at least once. Self-links (e.g. ["e1", "rel1", "e1"]) are avoided.

If relation_triples is False, the last column contains randomly generated strings.

Parameters:

Name	Type	Description	Default
`length`	`int`	Length of the DataFrame	required
`num_entities`	`int`	Number of unique entities	`None`
`num_rel`	`int`	Number of unique relations	`None`
`entity_prefix`	`str`	Prefix for entity strings	`'e'`
`relation_prefix`	`str`	Prefix for relation strings	`'rel'`
`relation_triples`	`bool`	If True the last column contains entities, else randomly generated string	`True`
`entity_ids`	`List[str]`	Predefined entity ids	`None`
`relation_ids`	`List[str]`	Predefined relation ids	`None`
`columns`	`List[str]`	Column names; ["head","relation","tail"] by default	`None`
`content_length`	`int`	Length of randomly generated string	`3`
`allowed_chars`	`str`	Allowed characters in randomly generated string	`ascii_letters`
`seed`	`int`	Seed for reproducibility.	`None`

Returns:

Type	Description
`DataFrame`	randomly generated triple DataFrame

Raises:

Type	Description
`ValueError`	If dummy_triples cannot be generated with the given specifications

Example:

>>> from strawman import dummy_triples
>>> df = dummy_triples(10)
>>> df
      head relation tail
0   e4     rel1   e0
1   e3     rel1   e1
2   e0     rel1   e5
3   e6     rel2   e3
4   e6     rel0   e4
5   e1     rel1   e0
6   e2     rel1   e0
7   e5     rel1   e3
8   e6     rel2   e0
9   e6     rel0   e2

Create an attribute triple DataFrame with predefined entities

>>> dummy_triples(10, entity_ids=set(df["head"]), relation_triples=False)
      head relation tail
0   e5     rel1  LOR
1   e6     rel1  rmM
2   e4     rel2  rmM
3   e0     rel2  LOR
4   e1     rel0  rmM
5   e5     rel2  Mda
6   e2     rel1  yhf
7   e3     rel2  ata
8   e5     rel2  gHk
9   e5     rel0  rmM

Source code in

/home/docs/checkouts/readthedocs.org/user_builds/strawman/envs/latest/lib/python3.10/site-packages/strawman/dummy_pandas.py

def dummy_triples(
    length: int,
    num_entities: int = None,
    num_rel: int = None,
    entity_prefix: str = "e",
    relation_prefix: str = "rel",
    relation_triples: bool = True,
    entity_ids: List[str] = None,
    relation_ids: List[str] = None,
    columns: List[str] = None,
    content_length: int = 3,
    allowed_chars: str = string.ascii_letters,
    seed: int = None,
) -> pd.DataFrame:
    """Create dummy DataFrame in form of triples.

    The default columns are ["head","relation","tail"].
    Entries in the head column consist of the `entity_prefix` ("e" by default)
    followed by a number. Entries in the relation column are built
    analogously, using the `relation_prefix`.

    All entities show up at least once. Self-links (e.g. ["e1", "rel1", "e1"]) are avoided.

    If `relation_triples` is False, the last column contains randomly generated strings.

    Args:
        length: Length of the DataFrame
        num_entities: Number of unique entities
        num_rel: Number of unique relations
        entity_prefix: Prefix for entity strings
        relation_prefix: Prefix for relation strings
        relation_triples: If True the last column contains entities, else randomly generated string
        entity_ids: Predefined entity ids
        relation_ids: Predefined relation ids
        columns: Column names ["head","relation","tail"] by default
        content_length: Length of randomly generated string
        allowed_chars: Allowed characters in randomly generated string
        seed: Seed for reproducibility.

    Returns:
        randomly generated triple DataFrame

    Raises:
        ValueError: If dummy_triples cannot be generated with the given specifications

    Example:
    ```pycon
    >>> from strawman import dummy_triples
    >>> df = dummy_triples(10)
    >>> df
          head relation tail
    0   e4     rel1   e0
    1   e3     rel1   e1
    2   e0     rel1   e5
    3   e6     rel2   e3
    4   e6     rel0   e4
    5   e1     rel1   e0
    6   e2     rel1   e0
    7   e5     rel1   e3
    8   e6     rel2   e0
    9   e6     rel0   e2
    ```

    Create an attribute triple DataFrame with predefined entities

    ```pycon
    >>> dummy_triples(10, entity_ids=set(df["head"]), relation_triples=False)
          head relation tail
    0   e5     rel1  LOR
    1   e6     rel1  rmM
    2   e4     rel2  rmM
    3   e0     rel2  LOR
    4   e1     rel0  rmM
    5   e5     rel2  Mda
    6   e2     rel1  yhf
    7   e3     rel2  ata
    8   e5     rel2  gHk
    9   e5     rel0  rmM
    ```
    """
    _coherence_check(
        length=length,
        num_entities=num_entities,
        num_rel=num_rel,
        entity_prefix=entity_prefix,
        relation_triples=relation_triples,
        columns=columns,
        content_length=content_length,
        allowed_chars=allowed_chars,
    )

    if columns is None:
        columns = TRIPLES_COL
    if num_entities is None:
        num_entities = math.ceil(length * 0.7)
    if num_rel is None:
        # Lower bound so that num_entities**2 * num_rel can reach `length`
        # distinct (head, relation, tail) combinations.
        minimum_rel = int(length / (num_entities * num_entities)) + 1
        num_rel = min(max(minimum_rel, int(num_entities * 0.7)), length)

    if seed is None:
        # Draw and log a concrete seed so the run can be reproduced later.
        seed = np.random.default_rng().integers(0, 10000)
        logger.debug("Selected seed %s", seed)
    rng = np.random.default_rng(seed=seed)
    head_values = (
        [entity_prefix + str(i) for i in range(num_entities)]
        if entity_ids is None
        else entity_ids
    )
    rel_values = (
        [relation_prefix + str(i) for i in range(num_rel)]
        if relation_ids is None
        else relation_ids
    )
    tail_values = (
        head_values
        if relation_triples
        else [
            # Forward allowed_chars so the caller's character set is honoured
            # for attribute values instead of the generator's default.
            random_string_generator(
                str_size=content_length, allowed_chars=allowed_chars, rng=rng
            )
            for _ in range(num_entities)
        ]
    )
    # Dict keys de-duplicate triples like a set would, but keep insertion
    # order, so the row order does not depend on per-process string hash
    # randomization.
    rows: dict = {}
    ensured_all_entities = False
    max_tries = length * 3
    while len(rows) < length:
        if max_tries == 0:
            raise ValueError(
                "Could not create DataFrame with the given specifications..."
            )
        # ensure all entities show up
        if not ensured_all_entities:
            longest = max(len(head_values), len(rel_values))
            for head, rel in zip(
                shuffled_overlong(head_values, longest, rng),
                shuffled_overlong(rel_values, longest, rng),
            ):
                tail = _choose_tail(head=head, tail_values=tail_values, rng=rng)
                rows[(head, rel, tail)] = None
            ensured_all_entities = True
        else:
            head = sequence_choice(head_values, rng)
            rel, tail = _choose_rel_tail(
                head=head, rel_values=rel_values, tail_values=tail_values, rng=rng
            )
            rows[(head, rel, tail)] = None
        max_tries -= 1
    return pd.DataFrame(list(rows), columns=columns)