"""An extensible ASCII table reader and writer.
fixedwidth.py:
Read or write a table with fixed width columns.
:Copyright: Smithsonian Astrophysical Observatory (2011)
:Author: Tom Aldcroft (aldcroft@head.cfa.harvard.edu)
"""
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
## * Redistributions of source code must retain the above copyright
## notice, this list of conditions and the following disclaimer.
## * Redistributions in binary form must reproduce the above copyright
## notice, this list of conditions and the following disclaimer in the
## documentation and/or other materials provided with the distribution.
## * Neither the name of the Smithsonian Astrophysical Observatory nor the
## names of its contributors may be used to endorse or promote products
## derived from this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
## ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
## WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
## DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
## DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
## (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
## LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
## ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
## SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import re
import itertools
from . import core
from .core import io, next, izip, any
class FixedWidthSplitter(core.BaseSplitter):
    """Splitter that cuts each line at fixed character positions.

    Every column object in ``self.cols`` must carry ``start`` and ``end``
    attributes (the Header class is expected to define them).  The positions
    follow the Python slice convention, i.e. ``line[col.start:col.end]`` is
    the text of the column.  No ``process_lines`` hook is provided because it
    is generally not useful for fixed-width input.

    Class attributes used when writing:

    :param delimiter_pad: string placed on both sides of the delimiter
    :param bookend: if true, also put the delimiter at both ends of the line
    """
    delimiter_pad = ''
    bookend = False

    def __call__(self, lines):
        """Yield the list of column substrings for each line in ``lines``.

        When ``self.process_val`` is set it is applied to every substring
        before the list is yielded.
        """
        for line in lines:
            fields = [line[col.start:col.end] for col in self.cols]
            if not self.process_val:
                yield fields
            else:
                yield [self.process_val(field) for field in fields]

    def join(self, vals, widths):
        """Build one output line from string ``vals`` right-justified to ``widths``.

        :param vals: sequence of already-formatted string values
        :param widths: sequence of column widths, paired with ``vals``
        :returns: the joined line as a single string
        """
        pad = self.delimiter_pad or ''
        delimiter = self.delimiter or ''

        # Left-pad each value with spaces up to its column width.
        cells = [val.rjust(width) for val, width in zip(vals, widths)]
        body = (pad + delimiter + pad).join(cells)

        if not self.bookend:
            return body
        return delimiter + pad + body + pad + delimiter
class FixedWidthHeader(core.BaseHeader):
    """Fixed width table header reader.

    The key settable class attributes are:

    :param auto_format: format string for auto-generating column names
    :param start_line: None, int, or a function of ``lines`` that returns None or int
    :param comment: regular expression for comment lines
    :param splitter_class: Splitter class for splitting data lines into columns
    :param names: list of names corresponding to each data column
    :param include_names: list of names to include in output (default=None selects all names)
    :param exclude_names: list of names to exclude from output (applied after ``include_names``)
    :param position_line: row index of line that specifies position
        (default = None, meaning there is no separate position line)
    :param position_char: character used to write the position line
    :param col_starts: list of start positions for each column (0-based counting)
    :param col_ends: list of end positions (inclusive) for each column
    :param delimiter_pad: padding around delimiter when writing
    :param bookend: put the delimiter at start and end of line when writing
    """
    position_line = None   # secondary header line position

    def get_line(self, lines, index):
        """Return the line at position ``index`` of the processed ``lines``.

        :param lines: list of table lines
        :param index: zero-based index into ``self.process_lines(lines)``
        :returns: the selected line
        :raises core.InconsistentTableError: if the processed lines run out
            before ``index`` is reached
        """
        for i, line in enumerate(self.process_lines(lines)):
            if i == index:
                break
        else:  # No header line matching
            raise core.InconsistentTableError('No header line found in table')
        return line

    def get_cols(self, lines):
        """Initialize the header Column objects from the table ``lines``.

        Based on the previously set Header attributes find or create the column names.
        Sets ``self.cols`` with the list of Columns.  This list only includes the actual
        requested columns after filtering by the include_names and exclude_names
        attributes.  See ``self.names`` for the full list.

        :param lines: list of table lines
        :returns: None
        :raises ValueError: if ``position_line`` is set while ``start_line`` is None
        :raises core.InconsistentTableError: if a required header or data line
            is missing, or the header line cannot be parsed
        """
        # See "else" clause below for explanation of start_line and position_line
        start_line = core._get_line_index(self.start_line, self.process_lines(lines))
        position_line = core._get_line_index(self.position_line, self.process_lines(lines))

        # If start_line is none then there is no header line.  Column positions are
        # determined from first data line and column names are either supplied by user
        # or auto-generated.
        if start_line is None:
            if position_line is not None:
                raise ValueError("Cannot set position_line without also setting header_start")
            data_lines = self.data.process_lines(lines)
            if not data_lines:
                raise core.InconsistentTableError(
                    'No data lines found so cannot autogenerate column names')
            vals, starts, ends = self.get_fixedwidth_params(data_lines[0])
            if self.names is None:
                self.names = [self.auto_format % i for i in range(1, len(vals) + 1)]

        else:
            # This bit of code handles two cases:
            # start_line = <index> and position_line = None
            #    Single header line where that line is used to determine both the
            #    column positions and names.
            # start_line = <index> and position_line = <index2>
            #    Two header lines where the first line defines the column names and
            #    the second line defines the column positions
            if position_line is not None:
                # Define self.col_starts and self.col_ends so that the call to
                # get_fixedwidth_params below will use those to find the header
                # column names.  Note that get_fixedwidth_params returns Python
                # slice col_ends but expects inclusive col_ends on input (for
                # more intuitive user interface).
                line = self.get_line(lines, position_line)
                vals, self.col_starts, col_ends = self.get_fixedwidth_params(line)
                self.col_ends = [x - 1 for x in col_ends]

            # Get the header column names and column positions
            line = self.get_line(lines, start_line)
            vals, starts, ends = self.get_fixedwidth_params(line)

            # Possibly override the column names with user-supplied values
            if self.names is None:
                self.names = vals

        # Filter self.names using include_names and exclude_names, then create
        # the actual Column objects.
        self._set_cols_from_names()
        self.n_data_cols = len(self.cols)

        # Set column start and end positions.  Also re-index the cols because
        # the FixedWidthSplitter does NOT return the ignored cols (as is the
        # case for typical delimiter-based splitters)
        for i, col in enumerate(self.cols):
            col.start = starts[col.index]
            col.end = ends[col.index]
            col.index = i

    def get_fixedwidth_params(self, line):
        """Split ``line`` on the delimiter and determine column values and
        column start and end positions.  This might include null columns with
        zero length (e.g. for ``header row = "| col1 || col2 | col3 |"`` or
        ``header2_row = "-----  -------   -----"``).  The null columns are
        stripped out.  Returns the values between delimiters and the
        corresponding start and end positions.

        :param line: input line
        :returns: (vals, starts, ends) where ``vals`` are the stripped column
            strings and ``starts`` / ``ends`` are Python-slice positions
            (``ends`` exclusive)
        :raises ValueError: if ``col_starts`` and ``col_ends`` differ in length
        :raises core.InconsistentTableError: if the values and positions found
            by splitting on the delimiter do not line up
        """
        # If column positions are already specified then just use those, otherwise
        # figure out positions between delimiters.
        if self.col_starts is not None and self.col_ends is not None:
            starts = list(self.col_starts)  # could be any iterable, e.g. np.array
            ends = [x + 1 for x in self.col_ends]  # user supplies inclusive endpoint
            if len(starts) != len(ends):
                raise ValueError('Fixed width col_starts and col_ends must have the same length')
            vals = [line[start:end].strip() for start, end in zip(starts, ends)]
        else:
            # There might be a cleaner way to do this but it works...
            vals = line.split(self.splitter.delimiter)
            starts = [0]
            ends = []
            for val in vals:
                if val:
                    ends.append(starts[-1] + len(val))
                    starts.append(ends[-1] + 1)
                else:
                    # Empty piece between two adjacent delimiters: just skip
                    # over the delimiter character.
                    starts[-1] += 1
            # The loop always leaves one trailing start with no matching end.
            starts = starts[:-1]
            vals = [x.strip() for x in vals if x]
            if len(vals) != len(starts) or len(vals) != len(ends):
                raise core.InconsistentTableError('Error parsing fixed width header')

        return vals, starts, ends

    def write(self, lines):
        """Do nothing; the header is emitted later together with the data."""
        # Header line not written until data are formatted.  Until then it is
        # not known how wide each column will be for fixed width.
        pass
class FixedWidthData(core.BaseData):
    """Fixed width table data reader and writer.

    :param start_line: None, int, or a function of ``lines`` that returns None or int
    :param end_line: None, int, or a function of ``lines`` that returns None or int
    :param comment: Regular expression for comment lines
    :param splitter_class: Splitter class for splitting data lines into columns
    """
    splitter_class = FixedWidthSplitter

    def write(self, lines):
        """Append the fixed-width rendering of the table to ``lines``.

        The column values are converted to strings first so that each column
        width can be taken from its widest entry (and from the column name when
        a header line is written).  Then the optional header line, the optional
        position line and all data lines are appended, in that order.

        :param lines: list that the output lines are appended to
        :returns: the same ``lines`` list
        :raises ValueError: if a position line is requested and
            ``header.position_char`` is not exactly one character long
        """
        with self._set_col_formats(self.cols, self.formats):
            # Col iterator does the formatting defined above so each val is a string
            # and vals is a tuple of strings for all columns of each row
            col_str_iters = [col.iter_str_vals() for col in self.cols]
            vals_list = list(izip(*col_str_iters))

        write_header = self.header.start_line is not None

        for i, col in enumerate(self.cols):
            # Fall back to zero width for a table without rows; max() of an
            # empty sequence would raise ValueError.
            col.width = max([len(vals[i]) for vals in vals_list] or [0])
            if write_header:
                col.width = max(col.width, len(col.name))

        widths = [col.width for col in self.cols]

        if write_header:
            lines.append(self.splitter.join([col.name for col in self.cols], widths))

        if self.header.position_line is not None:
            char = self.header.position_char
            if len(char) != 1:
                raise ValueError('Position_char="%s" must be a single character' % char)
            vals = [char * col.width for col in self.cols]
            lines.append(self.splitter.join(vals, widths))

        for vals in vals_list:
            lines.append(self.splitter.join(vals, widths))

        return lines
class FixedWidth(core.BaseReader):
    """Reader/writer for a fixed width table whose single header line supplies
    both the column names and the column positions.  Examples::

      # Bar delimiter in header and data

      |  Col1 |   Col2      |  Col3 |
      |  1.2  | hello there |     3 |
      |  2.4  | many words  |     7 |

      # Bar delimiter in header only

      Col1 |   Col2      | Col3
      1.2   hello there    3
      2.4   many words     7

      # No delimiter with column positions specified as input

      Col1       Col2Col3
       1.2hello there   3
       2.4many words    7

    See the :ref:`fixed_width_gallery` for specific usage examples.

    :param col_starts: list of start positions for each column (0-based counting)
    :param col_ends: list of end positions (inclusive) for each column
    :param delimiter_pad: padding around delimiter when writing (default = ' ')
    :param bookend: put the delimiter at start and end of line when writing (default = True)
    """

    def __init__(self, col_starts=None, col_ends=None, delimiter_pad=' ', bookend=True):
        core.BaseReader.__init__(self)

        # Create the header and data handlers and make each aware of the other.
        self.header = FixedWidthHeader()
        self.data = FixedWidthData()
        header, data = self.header, self.data
        data.header = header
        header.data = data

        # Settings shared by header and data: "|" delimiter and "#" comments.
        for section in (header, data):
            section.splitter.delimiter = '|'
            section.comment = r'\s*#'
            section.write_comment = '# '

        # Output layout options only apply to the data splitter.
        data.splitter.delimiter_pad = delimiter_pad
        data.splitter.bookend = bookend

        # Header on the first line, data starting right after it.
        header.start_line = 0
        data.start_line = 1

        # Optional user-supplied column positions.
        header.col_starts = col_starts
        header.col_ends = col_ends
class FixedWidthNoHeader(FixedWidth):
    """Reader/writer for a fixed width table that has no header line.

    Column names are either given by the user (``names`` keyword) or
    auto-generated.  Column positions come either from the ``col_starts`` and
    ``col_ends`` keywords or from splitting the first data line; in the latter
    case a ``delimiter`` is required to split the data line.

    Examples::

      # Bar delimiter in header and data

      |  1.2  | hello there |     3 |
      |  2.4  | many words  |     7 |

      # Compact table having no delimiter and column positions specified as input

      1.2hello there3
      2.4many words 7

    This class is just a convenience wrapper around the ``FixedWidth`` reader
    but with ``header.start_line = None`` and ``data.start_line = 0``.

    See the :ref:`fixed_width_gallery` for specific usage examples.

    :param col_starts: list of start positions for each column (0-based counting)
    :param col_ends: list of end positions (inclusive) for each column
    :param delimiter_pad: padding around delimiter when writing (default = ' ')
    :param bookend: put the delimiter at start and end of line when writing (default = True)
    """

    def __init__(self, col_starts=None, col_ends=None, delimiter_pad=' ', bookend=True):
        FixedWidth.__init__(self, col_starts=col_starts, col_ends=col_ends,
                            delimiter_pad=delimiter_pad, bookend=bookend)
        # No header line at all, so the data begin on the very first line.
        self.data.start_line = 0
        self.header.start_line = None
class FixedWidthTwoLine(FixedWidth):
    """Reader/writer for a fixed width table with two header lines: the first
    gives the column names and the second implicitly marks the column
    positions.  Examples::

      # Typical case with column extent defined by ---- under column names.

       col1    col2         <== header_start = 0
      -----  ------------   <== position_line = 1, position_char = "-"
        1     bee flies     <== data_start = 2
        2     fish swims

      # Pretty-printed table

      +------+------------+
      | Col1 |   Col2     |
      +------+------------+
      |  1.2 | "hello"    |
      |  2.4 | there world|
      +------+------------+

    See the :ref:`fixed_width_gallery` for specific usage examples.

    :param position_line: row index of line that specifies position (default = 1)
    :param position_char: character used to write the position line (default = "-")
    :param delimiter_pad: padding around delimiter when writing (default = None)
    :param bookend: put the delimiter at start and end of line when writing (default = False)
    """

    def __init__(self, position_line=1, position_char='-', delimiter_pad=None, bookend=False):
        FixedWidth.__init__(self, delimiter_pad=delimiter_pad, bookend=bookend)

        header = self.header
        header.position_line = position_line
        header.position_char = position_char

        # Data rows begin on the line directly below the position line.
        self.data.start_line = position_line + 1

        # Columns are separated by blanks rather than by "|".
        for section in (header, self.data):
            section.splitter.delimiter = ' '