I work with a lot of pandas DataFrames and want to test them with pytest, using Hypothesis to generate the test data.
The issue I am having is that it generates the same value in every row of each column.
I don't know how to make it generate realistic data to test with.
Here is what I am trying:
from hypothesis.extra.pandas import data_frames , column, range_indexes
from hypothesis import given, settings, strategies as st
import pandas as pd
from datetime import datetime
# Strategy describing the DataFrame under test: a RangeIndex with at least
# 100 rows and one `column(...)` specification per expected column.
# NOTE(review): 'post_engagement' draws objective-like labels while
# 'objective' draws floats -- the two look swapped; kept exactly as posted.
data = data_frames(
    index=range_indexes(min_size=100),
    columns=[
        column('key', elements=st.floats(allow_nan=True)),
        column(
            'fbms_start_date',
            # Upper bound is evaluated once, when this module is imported.
            elements=st.datetimes(
                min_value=datetime(2020, 7, 1),
                max_value=datetime.now(),
            ),
        ),
        # Constant columns: every row holds the same value by construction.
        column('breakdown_type', elements=st.just("Total")),
        column('breakdown_one', elements=st.just(float('nan'))),
        column('adset_id', elements=st.floats(allow_nan=True)),
        column('adset_name', elements=st.text()),
        column('campaign_id', elements=st.floats(allow_nan=True)),
        column('campaign_name', elements=st.text()),
        column('reach', elements=st.text()),
        column('impressions', elements=st.just(float('nan'))),
        column('spend', elements=st.floats(allow_nan=False)),
        column('page_likes', elements=st.floats(allow_nan=False)),
        column(
            'post_engagement',
            elements=st.sampled_from(
                ['LINK_CLICKS', 'POST_ENGAGEMENT', 'PAGE_LIKES']
            ),
        ),
        column('objective', elements=st.floats(allow_nan=False)),
        column('ads_run', elements=st.sampled_from([True, False])),
    ],
)
@given(df=data)
@settings(max_examples=5)
def test_hyothesis(df):
    """Smoke test: print each generated DataFrame so it can be inspected."""
    print(df)
    # Placeholder assertion; it always passes.
    assert 1
This always generates the following dataset:
key fbms_start_date breakdown_type breakdown_one adset_id adset_name campaign_id campaign_name reach impressions spend page_likes post_engagement objective ads_run
0 0.0 2020-07-01 Total 0.0 0.0 0.0 0.0 LINK_CLICKS 0.0 True
1 0.0 2020-07-01 Total 0.0 0.0 0.0 0.0 LINK_CLICKS 0.0 True
2 0.0 2020-07-01 Total 0.0 0.0 0.0 0.0 LINK_CLICKS 0.0 True
3 0.0 2020-07-01 Total 0.0 0.0 0.0 0.0 LINK_CLICKS 0.0 True
4 0.0 2020-07-01 Total 0.0 0.0 0.0 0.0 LINK_CLICKS 0.0 True
5 0.0 2020-07-01 Total 0.0 0.0 0.0 0.0 LINK_CLICKS 0.0 True
6 0.0 2020-07-01 Total 0.0 0.0 0.0 0.0 LINK_CLICKS 0.0 True
7 0.0 2020-07-01 Total 0.0 0.0 0.0 0.0 LINK_CLICKS 0.0 True
8 0.0 2020-07-01 Total 0.0 0.0 0.0 0.0 LINK_CLICKS 0.0 True
9 0.0 2020-07-01 Total 0.0 0.0 0.0 0.0 LINK_CLICKS 0.0 True
As you can see, each column contains a single repeated value rather than varied values; I don't know how to generate realistic values that I can test with.
Any help would be appreciated.
Zac gives some good insight in his answer, and from it I understood why I was getting the issue. I managed to generate usable data from the code I shared by applying a few tricks from the documentation. Although it does not generate 100% of what I wanted, it was enough for testing.
Here is how I defined the columns and then generated the data.
I use the following code to define the column strategies:
from hypothesis.extra.pandas import data_frames , column, range_indexes
from hypothesis import strategies as st, given, settings
import pandas as pd
from datetime import datetime
# Reusable element strategies shared by the column specifications.

# Calendar dates from 2020-07-01 up to "today"; the upper bound is evaluated
# once, when this module is imported.
datetime_st = st.dates(
    min_value=datetime(2020, 7, 1).date(),
    max_value=datetime.today().date(),
)

# Floats restricted to [0.0001, 3030]; NaN is excluded.
float_without_nan_st = st.floats(min_value=0.0001, max_value=3030, allow_nan=False)

# Floats that may be NaN but are never +/- infinity.
float_with_nan_st = st.floats(allow_nan=True, allow_infinity=False)

# Strings of at least five characters drawn from a fixed set of lowercase letters.
text_st = st.text(alphabet="espoiristusingacolemakeyboard", min_size=5)

# The boolean strategy is `st.booleans()`; `hypothesis.strategies` has no
# `boolean` attribute, so the original `st.boolean()` raised AttributeError.
boolean_st = st.booleans()
Then I described the DataFrame columns with:
# Maps each column name to the keyword arguments later passed to
# `hypothesis.extra.pandas.column(name, **kwargs)`.
#
# `"fill": st.nothing()` is the documented way to switch off Hypothesis's
# background "fill" value for a column (see the Hypothesis `arrays`/`column`
# docs).
#
# The original literal listed "fbmb_spend" twice; in a dict literal the later
# entry silently replaces the earlier one, which left `float_without_nan_st`
# unused. Only the first, NaN-free definition is kept here.
# NOTE(review): if NaN spend values are actually wanted, switch this entry to
# `float_with_nan_st` instead -- confirm the intended behaviour.
df_columns = {
    "fbms_start_date": {"elements": datetime_st, "unique": True},
    "fbmb_spend": {"elements": float_without_nan_st, "unique": True},
    "fbmb_adset_id": {"elements": float_with_nan_st, "unique": False, "fill": st.nothing()},
    "fbmb_adset_name": {"elements": text_st, "unique": False, "fill": st.nothing()},
    "fbmb_ads_run": {"elements": boolean_st},
    "fbms_key": {"elements": float_with_nan_st, "unique": False, "fill": st.nothing()},
    # Constant columns: every row holds the same value by construction.
    "fbmb_breakdown_type": {"elements": st.just("Total")},
    "fbmb_breakdown_one": {"elements": st.just(float('nan'))},
    "fbmb_campaign_id": {"elements": float_with_nan_st, "unique": False, "fill": st.nothing()},
    "fbmb_campaign_name": {"elements": text_st, "unique": False, "fill": st.nothing()},
    "fbmb_reach": {"elements": text_st, "unique": False, "fill": st.nothing()},
    "fbmb_impressions": {"elements": st.integers(min_value=0, max_value=100001)},
    "fbmb_page_likes": {"elements": float_with_nan_st, "unique": False, "fill": st.nothing()},
    "fbmb_post_engagement": {
        "elements": st.sampled_from(['LINK_CLICKS', 'POST_ENGAGEMENT', 'PAGE_LIKES']),
        "unique": False,
    },
    "fbmb_objective": {"elements": float_with_nan_st, "unique": False, "fill": st.nothing()},
}
Next I generated my dataset with :
# DataFrame strategy: one `column(...)` per entry in `df_columns`, on a
# RangeIndex of at least 10 rows.
test_dfs = data_frames(
    columns=[
        column(col_name, **col_options)
        for col_name, col_options in df_columns.items()
    ],
    index=range_indexes(min_size=10),
)
And finally, I was able to run the following tests
@given(df=test_dfs)
@settings(max_examples=5)
def test_hyothesis(df):
    """Print every DataFrame drawn from `test_dfs`; nothing is validated yet."""
    print(df)
    # Trivially true placeholder so the test passes.
    assert 1
Note the `min_size` passed to `range_indexes` when generating the dataset and the `max_examples` value in `settings`.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With