I have a dataset with several tables, each in the form of countries, years, and some indicators. I have converted all the excel tables to csv files, then merged them into one table.
The problem is that I have some tables that refuse to be merged, and the following message appears TypeError: '>' not supported between instances of 'int' and 'str'
I tried everything I can, but no luck, still the same error appears!
Also, I tried with hundreds of different files, but there are still tens of files that face this problem.
For the sample files file17.csv and file35.csv (In case someone needs to repeat it). Here are the code I used:
# To load the first file
import pandas as pd
filename1 = 'file17.csv'
df1 = pd.read_csv(filename1, encoding='cp1252', low_memory=False)
df1.set_index(['Country', 'Year'], inplace=True)
df1.dropna(axis=0, how='all', inplace=True)
df1.head()
Out>>>
+-------------+------+--------+--------+
| | | ind500 | ind356 |
| Country | Year | | |
| Afghanistan | 1800 | 603.0 | NaN |
| | 1801 | 603.0 | NaN |
| | 1802 | 603.0 | NaN |
| | 1803 | 603.0 | NaN |
| | 1804 | 603.0 | NaN |
+-------------+------+--------+--------+
In>>>
# To load the second file
filename2 = 'file35.csv'
df2 = pd.read_csv(filename2, encoding='cp1252', low_memory=False)
df2.set_index(['Country', 'Year'], inplace=True)
df2.dropna(axis=0, how='all', inplace=True)
df2.head()
Out>>>
# To merge the two dataframes
gross_df = pd.merge(df1, df2, left_index=True, right_index=True, how='outer')
gross_df.dropna(axis=0, how='all', inplace=True)
print (gross_df.shape)
gross_df.to_csv('merged.csv')
Important notice:
I noticed that in all the successful files, the columns names appear in ascending orders i.e. ind001
, ind009
, ind012
, as they were sorted automatically. while the files with errors have one or more columns with misordered placement like ind500
followed by in356
in the first table and the same applies to the second sample provided.
Notice that the two dataframesindiceswo indices (Country
and year
)
The error
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
C:\ProgramData\Anaconda2\envs\conda_python3\lib\site-packages\pandas\core\algorithms.py in safe_sort(values, labels, na_sentinel, assume_unique)
480 try:
--> 481 sorter = values.argsort()
482 ordered = values.take(sorter)
TypeError: '>' not supported between instances of 'int' and 'str'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-11-960b2698de60> in <module>()
----> 1 gross_df = pd.merge(df1, df2, left_index=True, right_index=True, how='outer', sort=False)
2 gross_df.dropna(axis=0, how='all', inplace=True)
3 print (gross_df.shape)
4 gross_df.to_csv('merged.csv')
C:\ProgramData\Anaconda2\envs\conda_python3\lib\site-packages\pandas\core\reshape\merge.py in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator)
52 right_index=right_index, sort=sort, suffixes=suffixes,
53 copy=copy, indicator=indicator)
---> 54 return op.get_result()
55
56
C:\ProgramData\Anaconda2\envs\conda_python3\lib\site-packages\pandas\core\reshape\merge.py in get_result(self)
567 self.left, self.right)
568
--> 569 join_index, left_indexer, right_indexer = self._get_join_info()
570
571 ldata, rdata = self.left._data, self.right._data
C:\ProgramData\Anaconda2\envs\conda_python3\lib\site-packages\pandas\core\reshape\merge.py in _get_join_info(self)
720 join_index, left_indexer, right_indexer = \
721 left_ax.join(right_ax, how=self.how, return_indexers=True,
--> 722 sort=self.sort)
723 elif self.right_index and self.how == 'left':
724 join_index, left_indexer, right_indexer = \
C:\ProgramData\Anaconda2\envs\conda_python3\lib\site-packages\pandas\core\indexes\base.py in join(self, other, how, level, return_indexers, sort)
2995 else:
2996 return self._join_non_unique(other, how=how,
-> 2997 return_indexers=return_indexers)
2998 elif self.is_monotonic and other.is_monotonic:
2999 try:
C:\ProgramData\Anaconda2\envs\conda_python3\lib\site-packages\pandas\core\indexes\base.py in _join_non_unique(self, other, how, return_indexers)
3076 left_idx, right_idx = _get_join_indexers([self.values],
3077 [other._values], how=how,
-> 3078 sort=True)
3079
3080 left_idx = _ensure_platform_int(left_idx)
C:\ProgramData\Anaconda2\envs\conda_python3\lib\site-packages\pandas\core\reshape\merge.py in _get_join_indexers(left_keys, right_keys, sort, how, **kwargs)
980
981 # get left & right join labels and num. of levels at each location
--> 982 llab, rlab, shape = map(list, zip(* map(fkeys, left_keys, right_keys)))
983
984 # get flat i8 keys from label lists
C:\ProgramData\Anaconda2\envs\conda_python3\lib\site-packages\pandas\core\reshape\merge.py in _factorize_keys(lk, rk, sort)
1409 if sort:
1410 uniques = rizer.uniques.to_array()
-> 1411 llab, rlab = _sort_labels(uniques, llab, rlab)
1412
1413 # NA group
C:\ProgramData\Anaconda2\envs\conda_python3\lib\site-packages\pandas\core\reshape\merge.py in _sort_labels(uniques, left, right)
1435 labels = np.concatenate([left, right])
1436
-> 1437 _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1)
1438 new_labels = _ensure_int64(new_labels)
1439 new_left, new_right = new_labels[:l], new_labels[l:]
C:\ProgramData\Anaconda2\envs\conda_python3\lib\site-packages\pandas\core\algorithms.py in safe_sort(values, labels, na_sentinel, assume_unique)
483 except TypeError:
484 # try this anyway
--> 485 ordered = sort_mixed(values)
486
487 # labels:
C:\ProgramData\Anaconda2\envs\conda_python3\lib\site-packages\pandas\core\algorithms.py in sort_mixed(values)
469 str_pos = np.array([isinstance(x, string_types) for x in values],
470 dtype=bool)
--> 471 nums = np.sort(values[~str_pos])
472 strs = np.sort(values[str_pos])
473 return _ensure_object(np.concatenate([nums, strs]))
C:\ProgramData\Anaconda2\envs\conda_python3\lib\site-packages\numpy\core\fromnumeric.py in sort(a, axis, kind, order)
820 else:
821 a = asanyarray(a).copy(order="K")
--> 822 a.sort(axis=axis, kind=kind, order=order)
823 return a
824
TypeError: '>' not supported between instances of 'int' and 'str'
This error indicates that indices in merged DF have different dtypes
Demo - how to convert string index level to int
:
In [183]: df
Out[183]:
0 1 2 3
bar 1 -0.205037 0.762509 0.816608 -1.057907
2 1.249104 0.338777 -0.982084 0.329330
baz 1 0.845695 -0.996365 0.548100 -0.113733
2 1.247092 -2.674061 -0.071993 -0.734242
foo 1 -1.233825 -0.195377 -0.240303 1.168055
2 -0.108942 -0.615612 -1.299512 0.908641
qux 1 0.844421 0.251425 -0.506877 1.307800
2 0.038580 0.045072 -0.262974 0.629804
In [184]: df.index
Out[184]:
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['1', '2']],
labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]])
In [185]: df.index.get_level_values(1)
Out[185]: Index(['1', '2', '1', '2', '1', '2', '1', '2'], dtype='object')
In [187]: df.index = df.index.set_levels(df.index.get_level_values(1) \
.map(lambda x: pd.to_numeric(x, errors='coerce')), level=1)
Result:
In [189]: df.index.get_level_values(1)
Out[189]: Int64Index([1, 2, 1, 2, 1, 2, 1, 2], dtype='int64')
UPDATE: try this:
In [247]: d1 = pd.read_csv('https://docs.google.com/uc?id=1jUsbr5pw6sUMvewI4fmbpssroG4RZ7LE&export=download', index_col=[0,1])
In [248]: d2 = pd.read_csv('https://docs.google.com/uc?id=1Ufx6pvnSC6zQdTAj05ObmV027fA4-Mr3&export=download', index_col=[0,1])
In [249]: d2 = d2[pd.to_numeric(d2.index.get_level_values(1), errors='coerce').notna()]
In [250]: d2.index = d2.index.set_levels(d2.index.get_level_values(1).map(lambda x: pd.to_numeric(x, errors='coerce')), level=1)
In [251]: d1.reset_index().merge(d2.reset_index(), on=['Country','Year'], how='outer').set_index(['Country','Year'])
Out[251]:
ind500 ind356 ind475 ind476 ind456
Country Year
Afghanistan 1800 603.0 NaN NaN NaN NaN
1801 603.0 NaN NaN NaN NaN
1802 603.0 NaN NaN NaN NaN
1803 603.0 NaN NaN NaN NaN
1804 603.0 NaN NaN NaN NaN
1805 603.0 NaN NaN NaN NaN
1806 603.0 NaN NaN NaN NaN
1807 603.0 NaN NaN NaN NaN
1808 603.0 NaN NaN NaN NaN
1809 603.0 NaN NaN NaN NaN
... ... ... ... ... ...
Bahamas, The 1967 NaN NaN NaN NaN 18381.131314
Gambia, The 1967 NaN NaN NaN NaN 937.355288
Korea, Dem. Rep. 1967 NaN NaN NaN NaN 1428.689253
Lao PDR 1967 NaN NaN NaN NaN 1412.359955
Netherlands Antilles 1967 NaN NaN NaN NaN 14076.731352
Russian Federation 1967 NaN NaN NaN NaN 11794.726437
Serbia and Montenegro 1967 NaN NaN NaN NaN 2987.080489
Syrian Arab Republic 1967 NaN NaN NaN NaN 2015.913906
Yemen, Rep. 1967 NaN NaN NaN NaN 1075.693355
Bahamas, The 1968 NaN NaN NaN NaN 18712.082830
[46607 rows x 5 columns]
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With