python - 如何在 Hypothesis 的 data_frames 中实现相互依赖的列

Question

我正在使用 Hypothesis 的 data_frames 策略来生成一个数据框，其中 start_time 和 end_time 是两列。下面是一段代码：

import hypothesis.strategies as st
import logging
import datetime

from hypothesis import given
from hypothesis.extra.pandas import column, data_frames, range_indexes

# Local midnight of the day this module is loaded; every bound below is an
# offset from this instant.
current_time = datetime.datetime.now().replace(
    hour=0, minute=0, second=0, microsecond=0
)

# Integer timestamps between 04:00 and 20:00 of that day.
datetime_st = st.integers(
    max_value=(current_time + datetime.timedelta(hours=20)).timestamp(),
    min_value=(current_time + datetime.timedelta(hours=4)).timestamp(),
)

# Both columns draw from the same strategy independently of each other, so
# nothing here relates a row's start_time to its end_time.
df_columns = dict(
    # other fields omitted
    start_time=dict(elements=datetime_st, unique=False),
    end_time=dict(elements=datetime_st, unique=False),
)
test_dfs = data_frames(
    columns=[column(name, **spec) for name, spec in df_columns.items()],
    index=range_indexes(min_size=20, max_size=100),
)

@given(df=test_dfs)
def test_hyothesis(df):
    """Placeholder property test: log each generated frame, check nothing yet."""
    logging.info(df)
    assert True

我无法找到一个解决方案来保证每个 end_time 都比其对应的 start_time 至少大一个 delta。我已经尝试过 composite，但不确定如何在 data_frames 中使用它。

有没有办法在生成 start_time 和 end_time 时，把这个最小差值（delta）作为规则强制执行？

score 2 · Accepted Answer

这是一种生成包含两个时间戳列的数据框的方法，其中第二列与第一列之间的差值至少为 3600 秒（或其他任意时间量）。我为此使用了策略的 .flatmap 方法。

import datetime
import logging

import hypothesis.strategies as st
import pandas as pd
from hypothesis import given
from hypothesis.extra.pandas import column, data_frames, range_indexes, columns

# Local midnight of the current day, as a POSIX timestamp in seconds.
current_time = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0).timestamp()

# Minimum gap, in seconds, that end_time must keep ahead of start_time.
MIN_DIFF_SECONDS = 3600

# Strategy producing (start_time, end_time) tuples of integer timestamps.
# The start is drawn first (between 04:00 and 20:00 of the current day);
# flatmap then builds the end-time strategy from that start, so the end is
# always at least MIN_DIFF_SECONDS and at most 10 hours after the start.
two_timestamps_with_diff = st.integers(
    min_value=current_time + 3600 * 4,
    max_value=current_time + 3600 * 20,  # 20 hours; was mistyped as 4600 * 20
).flatmap(
    lambda start: st.tuples(
        # The start value is already fixed at this point; pass it through.
        st.just(start),
        st.integers(
            min_value=start + MIN_DIFF_SECONDS,
            max_value=start + 3600 * 10,
        ),
    )
)

# sample code to examine the results of this strategy
# for _ in range(10):
#     x, y = two_timestamps_with_diff.example()
#     print(x, y, y-x)
    
# Each row is one (start_time, end_time) tuple drawn from
# two_timestamps_with_diff, so the two columns are generated together.
test_dfs = data_frames(
    rows=two_timestamps_with_diff,
    columns=columns(["start_time", "end_time"], dtype=int),
    index=range_indexes(max_size=100, min_size=20),
)

# sample code to examine the results of this strategy
# res = test_dfs.example()
# res.assign(d = res.end_time - res.start_time)

# a test with an assertion that validates this constraint. 
@given(df=test_dfs)
def test_hyothesis(df):
    """Check that every row's end_time is at least MIN_DIFF_SECONDS after its start_time."""
    logging.info(df)
    gaps = df["end_time"] - df["start_time"]
    assert (gaps >= MIN_DIFF_SECONDS).all()
    
# Run the test by calling it directly. It passes.
test_hyothesis()

如果您想向自动生成的数据框添加其他列，请执行以下操作（在此示例中，新列是“A”和“B”）：

from hypothesis.strategies import composite

@composite
def test_df_with_additional_columns(draw, elements=test_dfs):
    """Draw a base data frame and append two extra generated columns.

    Args:
        draw: Supplied by ``@composite``; draws a value from a strategy.
        elements: Strategy producing the base data frame. Defaults to
            ``test_dfs``.

    Returns:
        The base frame concatenated column-wise with an int column "A" and
        a float column "B" generated on the base frame's index.
    """
    # Draw from the strategy that was passed in, so callers can override
    # the base frame (previously `elements` was accepted but ignored).
    df = draw(elements)

    # Pin the extra columns to the base frame's index so both frames have
    # the same rows and the column-wise concat lines up one-to-one.
    more_col_strategy = data_frames(
        [column("A", dtype=int), column("B", dtype=float)],
        index=st.just(df.index),
    )
    more_cols = draw(more_col_strategy)

    return pd.concat([df, more_cols], axis=1)

# Draw one example frame from the composite strategy to inspect the result.
test_df_with_additional_columns().example()

python - 如何在 Hypothesis 的 data_frames 中实现相互依赖的列

1 回答 1

Related

Reference