Source code for simudo.util.xcsv

# copyright 2019 Eduard Christian Dumitrescu
# license: CC0 / https://creativecommons.org/publicdomain/zero/1.0/
'''
This (standalone) module implements a Pandas CSV reader-writer pair
that allows data types to survive a round-trip (where they wouldn't
using plain pandas ``to_csv``). It achieves this by saving some column
metadata to JSON, and by prefixing string values with a ":" character
so that they cannot be confused with NaN values (NaN is also allowed
in string columns, and without the prefix the written data would be
ambiguous on read-back).

See :py:meth:`~XCSV.to_csv` and :py:meth:`~XCSV.read_csv` for more info.

These methods are available as simple functions, so you can do::

    >>> to_xcsv(df, "hello.csv")
    >>> df2, meta = read_xcsv("hello.csv")
'''
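
# Illustrative sketch (not actual generated output): writing a frame with a
# float column "x" and a string column "name" to "hello.csv" also produces
# "hello.csv_meta.json", roughly of the form
#
#     {
#       "columns": ["x", "name"],
#       "dtypes": {"x": "float64", "name": "object"},
#       "string": ["name"],
#       "string_prefix": ":",
#       "xcsv_version": 1
#     }
#
# while every string cell in the CSV itself is written with the ":" prefix
# (e.g. ":hello"); read_xcsv uses the metadata to restore the dtypes and
# strips the prefix again.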

import functools
import json
import os
import re
import unittest

import numpy as np
try:
    import pandas as pd
except ImportError:
    pd = None
from cached_property import cached_property

__all__ = ['to_xcsv', 'read_xcsv',
           'XCSV', 'XCSVBase', 'XCSVWriter', 'XCSVReader']


class XCSVBase():
    _STRING_PREFIX = ':'
    _XCSV_VERSION = 1

    def json_path_from_xcsv_path(self, path):
        return path + '_meta.json'

    @cached_property
    def json_path(self):
        return self.json_path_from_xcsv_path(self.path)

    def is_string_column(self, df, dtypes_dict, column_name):
        return dtypes_dict[column_name] == 'object'


class XCSVWriter(XCSVBase):
    @property
    def string_prefix(self):
        return self._STRING_PREFIX

    @cached_property
    def dtypes_dict(self):
        df = self.df
        ddf = df.dtypes.to_frame('dtypes').reset_index()
        dtypes_dict = ddf.set_index('index')['dtypes'].astype(str).to_dict()
        return dtypes_dict

    def compute_meta(self):
        return {
            'columns': list(self.df.columns),
            'dtypes': self.dtypes_dict,
            'string': list(sorted(self.string_columns)),
            'string_prefix': self._STRING_PREFIX,
            'xcsv_version': self._XCSV_VERSION}

    @cached_property
    def meta(self):
        return self.compute_meta()

    @cached_property
    def string_columns(self):
        dtypes_dict = self.dtypes_dict
        df = self.df
        return set(k for k in dtypes_dict.keys()
                   if self.is_string_column(df, dtypes_dict, k))

    def to_csv(self):
        df = self.df
        dtypes_dict = self.dtypes_dict
        _STRING_PREFIX = self._STRING_PREFIX

        def string_func(value):
            # prefix actual strings so they cannot be read back as NaN
            if isinstance(value, str):
                return _STRING_PREFIX + value
            else:
                return value

        dfc = pd.DataFrame(index=df.index)
        for k in df.columns:
            series = df[k]
            if k in self.string_columns:
                series = series.map(string_func)
            dfc[k] = series

        to_csv_kwargs = self.to_csv_kwargs.copy()
        to_csv_kwargs.setdefault('encoding', 'utf-8-sig')

        self.write_json_meta()
        dfc.to_csv(self.path, **to_csv_kwargs)

    def write_json_meta(self):
        with open(self.json_path, 'wt') as h:
            json.dump(self.meta, h, sort_keys=True, indent=2)


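# Minimal direct-usage sketch for the writer (XCSV.to_csv below performs
# exactly these assignments); ``df`` and the output path are illustrative:
#
#     w = XCSVWriter()
#     w.df = df
#     w.path = "out.csv"
#     w.to_csv_kwargs = {}
#     w.to_csv()   # writes out.csv and out.csv_meta.json

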
class XCSVReader(XCSVBase):
    def load_json_meta(self):
        with open(self.json_path, 'rt') as h:
            return json.load(h)

    @cached_property
    def meta(self):
        return self.load_json_meta()

    @cached_property
    def dtypes_dict(self):
        return self.meta['dtypes']

    @cached_property
    def string_columns(self):
        return set(self.meta['string'])

    @cached_property
    def string_prefix(self):
        return self.meta['string_prefix']

    def read_csv(self):
        meta = self.meta
        xcsv_ver = meta['xcsv_version']
        if xcsv_ver != self._XCSV_VERSION:
            raise ValueError("unexpected xcsv_version {!r} (expected {!r})"
                             .format(xcsv_ver, self._XCSV_VERSION))

        read_csv_kwargs = self.read_csv_kwargs.copy()
        read_csv_kwargs.setdefault('encoding', 'utf-8-sig')

        dt = self.dtypes_dict
        datetime_cols = []
        for k, v in dt.items():
            if v.startswith("datetime"):
                # datetime dtypes cannot be requested through `dtype=`;
                # read them as objects and let `parse_dates` convert them
                dt[k] = 'object'
                datetime_cols.append(k)

        df = pd.read_csv(
            self.path, dtype=dt, parse_dates=datetime_cols,
            **read_csv_kwargs)

        dfc = pd.DataFrame(index=df.index)
        for k in df.columns:
            series = df[k]
            if k in self.string_columns:
                self.process_string_column(
                    df=df, column_name=k, series=series)
            dfc[k] = series

        return (dfc, meta)

    @cached_property
    def string_prefix_re(self):
        return '^' + re.escape(self.string_prefix)

    def process_string_column(self, df, column_name, series):
        series.replace(
            {'True': True, 'False': False}, inplace=True)
        # TODO: parse numerical values
        series.replace(
            self.string_prefix_re, '', regex=True, inplace=True)


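# Reader counterpart sketch (normally reached via XCSV.read_csv below); the
# path is illustrative, and the kwargs are forwarded to pandas.read_csv, so
# index_col=0 restores the index column the writer emitted:
#
#     r = XCSVReader()
#     r.path = "out.csv"
#     r.read_csv_kwargs = {"index_col": 0}
#     df2, meta = r.read_csv()

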
class XCSV():
    reader_class = XCSVReader
    writer_class = XCSVWriter

    @classmethod
    def to_csv(cls, df, path, json_path=None, to_csv_kwargs={}):
        '''Basically the same as :py:meth:`pandas.DataFrame.to_csv`, but
        with proper escaping for strings to prevent them from being
        accidentally parsed as numbers or NaN, and with column dtypes
        written to an accompanying json file.

        If the csv filename is :code:`"a.csv"`, then the file containing
        the metadata will be called :code:`"a.csv_meta.json"`.

        "XCSV" pronounced "excessive".

        Warning: mixed-type ("object") columns are assumed to be string
        columns, so make sure they contain nothing other than strings or
        NaN, or else your data might not survive the round trip.

        What's definitely safe:

        - Columns with floats/ints and NaNs.
        - Columns with strings and NaNs.
        - Columns with booleans (no NaNs allowed!).
        '''
        obj = cls.writer_class()
        obj.df = df
        obj.path = path
        if json_path is not None:
            obj.json_path = json_path
        obj.to_csv_kwargs = to_csv_kwargs
        return obj.to_csv()

    @classmethod
    def read_csv(cls, path, json_path=None, read_csv_kwargs={}):
        '''Opposite of :py:meth:`to_csv`.'''
        obj = cls.reader_class()
        obj.path = path
        if json_path is not None:
            obj.json_path = json_path
        obj.read_csv_kwargs = read_csv_kwargs
        return obj.read_csv()


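# By default the metadata lands next to the CSV as "<path>_meta.json"
# (see XCSVBase.json_path_from_xcsv_path); both classmethods also accept an
# explicit location. A sketch with illustrative paths:
#
#     XCSV.to_csv(df, "data.csv", json_path="meta/data.json")
#     df2, meta = XCSV.read_csv("data.csv", json_path="meta/data.json")

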
to_xcsv = XCSV.to_csv
read_xcsv = XCSV.read_csv


class TestMe(unittest.TestCase):
    def test_roundtrip(self):
        import tempfile
        with tempfile.TemporaryDirectory() as tmp:
            a = pd.DataFrame({
                'ints': [1, 2, 3, 4],
                'ints_and_nan': [np.nan, 2, 3, 4],
                'floats': [np.nan, 0.2, 0.3, 0.4],
                'strings': "goodbye cruel world xoxo".split(),
                'strings_and_nan': [np.nan, "NaN", "2", "4"],
                'bool': [True, False, False, True],
                'datetime': pd.to_datetime(
                    ["2016-01-01", "2019-05-03 00:30",
                     "2019-05-06 00:00+04:00", np.nan]),
                # 'bool_and_nan': [True, np.nan, False, True],  # not supported
                # 'mixed': [True, 'True', 2, 4.5],  # not supported
            })
            # tmp = "/tmp"
            path = os.path.join(tmp, "example.csv")
            # print(a)
            # print(a.dtypes)
            to_xcsv(a, path)
            # print(open(path, 'rt').read())
            b = read_xcsv(path, read_csv_kwargs=dict(index_col=0))[0]
            # print(b)

            for df in [a, b]:
                # self.assertTrue(isinstance(df.a.iloc[0], int))
                self.assertTrue(isinstance(df.floats.iloc[0], float))
                self.assertTrue(isinstance(df.strings_and_nan.iloc[1], str))
                # self.assertTrue(isinstance(df.d.iloc[0], bool))

            N = len(a)
            for col in a.columns:
                for i in range(N):
                    va, vb = a[col].iloc[i], b[col].iloc[i]
                    try:
                        if col in ['datetime']:
                            if va is not vb:  # handle NaT
                                self.assertEqual(va, vb)
                        else:
                            # handle NaN
                            np.testing.assert_array_equal(va, vb)
                    except:
                        print("col={!r} i={} {!r} {!r}".format(
                            col, i, va, vb))
                        raise