Consider the following DataFrame (note that the values are strings):
df = pd.DataFrame([['3', '11'], ['0', '2']], columns=list('AB'))
df
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
A 2 non-null object
B 2 non-null object
dtypes: object(2)
memory usage: 104.0+ bytes
I'm going to sum. I expect the strings to be concatenated.
df.sum()
A 30.0
B 112.0
dtype: float64
It looks as though the strings were concatenated and then converted to float. Is there a good reason for this? Is this a bug? Anything enlightening will be upvoted.
Python sum of floats: Output: 7.0. If you want to add floating-point values with extended precision, you can use the math.fsum() function.
Integer ( int ): represents positive or negative whole numbers like 3 or -512. Floating point number ( float ): represents real numbers like 3.14159 or -2.5. Character string (usually called “string”, str ): text. Written in either single quotes or double quotes (as long as they match).
I went with the good old stack trace, and learned a bit about pdb through PyCharm as well. It turns out that what happens is the following:
1)
# Attach ``sum`` to ``cls``: the factory receives ``nanops.nansum`` as its
# last argument, i.e. as the reducing function the generated method applies.
cls.sum = _make_stat_function(
    'sum', name, name2, axis_descr,
    'Return the sum of the values for the requested axis',
    nanops.nansum)
Let's have a look at _make_stat_function
2)
def _make_stat_function(name, name1, name2, axis_descr, desc, f):
    """Define the ``stat_func`` method that reduces an object with ``f``.

    ``name`` is passed to keyword validation, to the per-level aggregation
    and to ``_reduce`` as the operation name.  ``name1``, ``name2``,
    ``axis_descr`` and ``desc`` are only handed to the ``Substitution``
    decorator.

    NOTE(review): this excerpt stops before the factory returns anything;
    presumably the full source returns ``stat_func`` -- confirm upstream.
    """
    # NOTE(review): Substitution/Appender presumably build the docstring of
    # ``stat_func`` from the ``_num_doc`` template -- confirm; for that reason
    # no docstring is written on ``stat_func`` itself.
    @Substitution(outname=name, desc=desc, name1=name1, name2=name2,
                  axis_descr=axis_descr)
    @Appender(_num_doc)
    def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
                  **kwargs):
        # NOTE(review): presumably accepts only the 'out' and 'dtype' extra
        # keywords -- confirm against _validate_kwargs.
        _validate_kwargs(name, kwargs, 'out', 'dtype')
        # skipna defaults to True when the caller leaves it as None.
        if skipna is None:
            skipna = True
        # With no axis given, use the object's own ``_stat_axis_number``.
        if axis is None:
            axis = self._stat_axis_number
        # A level was requested: delegate to the per-level aggregation.
        if level is not None:
            return self._agg_by_level(name, axis=axis, level=level,
                                      skipna=skipna)
        # Otherwise hand ``f`` to the object's ``_reduce``; ``numeric_only``
        # is forwarded unchanged (None by default).
        return self._reduce(f, name, axis=axis, skipna=skipna,
                            numeric_only=numeric_only)
The last line is key. It's kind of funny, as there are about 7 different _reduce methods within pandas.core. pdb says it's the one in pandas.core.frame. Let's take a look.
3)
def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
            filter_type=None, **kwds):
    """Reduce the frame along ``axis`` with ``op`` and return a Series.

    ``op`` is invoked as ``op(values, axis=axis, skipna=skipna, **kwds)``.
    ``name`` is not used anywhere in the visible body.

    ``numeric_only`` selects the input:
      * ``None``  -- try ``self.values`` first; if ``op`` raises, fall back
        to a by-column ``apply`` and then to the numeric/bool subset.
      * truthy    -- reduce only the numeric (or bool) subset.
      * otherwise -- reduce ``self.values`` with no fallback.

    ``filter_type`` may be ``None``, ``'numeric'`` or ``'bool'``; any other
    value leads to ``NotImplementedError`` on the paths that need a subset.

    An object-dtype result is cast to ``np.float64`` when ``filter_type`` is
    ``None`` or ``'numeric'``.  This is why summing columns of strings whose
    concatenation converts cleanly to a number (e.g. ``'30'``) comes back as
    floats (``30.0``).
    """
    axis = self._get_axis_number(axis)

    # Bind axis/skipna/kwds once so every path below calls ``op`` the same way.
    def f(x):
        return op(x, axis=axis, skipna=skipna, **kwds)

    labels = self._get_agg_axis(axis)

    # exclude timedelta/datetime unless we are uniform types
    if axis == 1 and self._is_mixed_type and self._is_datelike_mixed_type:
        numeric_only = True

    if numeric_only is None:
        # Optimistic path: reduce everything, only filter if that fails.
        try:
            values = self.values
            result = f(values)
        except Exception as e:
            # try by-column first
            if filter_type is None and axis == 0:
                try:
                    # this can end up with a non-reduction
                    # but not always. if the types are mixed
                    # with datelike then need to make sure a series
                    result = self.apply(f, reduce=False)
                    if result.ndim == self.ndim:
                        result = result.iloc[0]
                    # Early return: this path skips the object-dtype cast
                    # and the Series construction at the bottom.
                    return result
                except:
                    # Bare except: any failure here is swallowed and the
                    # code falls through to the subset below.
                    pass
            if filter_type is None or filter_type == 'numeric':
                data = self._get_numeric_data()
            elif filter_type == 'bool':
                data = self._get_bool_data()
            else: # pragma: no cover
                # ``e`` is rebound, replacing the exception caught above.
                e = NotImplementedError("Handling exception with filter_"
                                        "type %s not implemented." %
                                        filter_type)
                # NOTE(review): presumably raises ``e`` -- confirm; otherwise
                # ``data`` would be unbound on the next line.
                raise_with_traceback(e)
            # Retry on the filtered data; labels must match that subset.
            result = f(data.values)
            labels = data._get_agg_axis(axis)
    else:
        if numeric_only:
            if filter_type is None or filter_type == 'numeric':
                data = self._get_numeric_data()
            elif filter_type == 'bool':
                data = self._get_bool_data()
            else: # pragma: no cover
                msg = ("Generating numeric_only data with filter_type %s"
                       "not supported." % filter_type)
                raise NotImplementedError(msg)
            values = data.values
            labels = data._get_agg_axis(axis)
        else:
            values = self.values
        result = f(values)

    # Object-dtype results get one cast attempt: float64 for the default or
    # 'numeric' filter, bool for the 'bool' filter when nothing is null.
    if hasattr(result, 'dtype') and is_object_dtype(result.dtype):
        try:
            if filter_type is None or filter_type == 'numeric':
                # Object results that convert cleanly (e.g. the string '30')
                # become floats here.
                result = result.astype(np.float64)
            elif filter_type == 'bool' and notnull(result).all():
                result = result.astype(np.bool_)
        except (ValueError, TypeError):
            # try to coerce to the original dtypes item by item if we can
            if axis == 0:
                result = com._coerce_to_dtypes(result, self.dtypes)

    return Series(result, index=labels)
Holy smokes, talk about an out of control function. Someone needs a refactoring! Let's zoom in on the trouble line(s):
if hasattr(result, 'dtype') and is_object_dtype(result.dtype):
try:
if filter_type is None or filter_type == 'numeric':
result = result.astype(np.float64)
And you better believe that last line gets executed. Here's some of the pdb trace:
> c:\users\matthew\anaconda2\lib\site-packages\pandas\core\frame.py(4801)_reduce()
-> result = result.astype(np.float64)
(Pdb) l
4796 result = f(values)
4797
4798 if hasattr(result, 'dtype') and is_object_dtype(result.dtype):
4799 try:
4800 if filter_type is None or filter_type == 'numeric':
4801 -> result = result.astype(np.float64)
4802 elif filter_type == 'bool' and notnull(result).all():
4803 result = result.astype(np.bool_)
4804 except (ValueError, TypeError):
4805
4806 # try to coerce to the original dtypes item by item if we can
If you're a non-believer, open up pandas/core/frame.py and put a print "OI" statement right above line 4801. It should splat out to the console :). Note that I'm on Anaconda 2, on Windows.
I'm going to go with "bug", to answer your question.
If you love us, you can donate to us via PayPal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With