Switch to faster read_csv() for Hipparcos catalog

Yes, it has fixed-width fields, but Pandas is slow at importing those, so we take advantage of the fact that it also has delimiters!
2020-07-21 07:19:22 -04:00 · 2020-07-21 07:19:22 -04:00 · 0af8e01b4a
parent a7c2794b60
commit 0af8e01b4a
1 changed file with 24 additions and 11 deletions
--- a/skyfield/data/hipparcos.py
+++ b/skyfield/data/hipparcos.py
@@ -41,6 +41,21 @@ analysis toolkit.  Try installing it using your usual Python package
 installer, like "pip install pandas" or "conda install pandas".
 """

+_COLUMN_NAMES = (
+    'Catalog', 'HIP', 'Proxy', 'RAhms', 'DEdms', 'Vmag',
+    'VarFlag', 'r_Vmag', 'RAdeg', 'DEdeg', 'AstroRef', 'Plx', 'pmRA',
+    'pmDE', 'e_RAdeg', 'e_DEdeg', 'e_Plx', 'e_pmRA', 'e_pmDE', 'DE:RA',
+    'Plx:RA', 'Plx:DE', 'pmRA:RA', 'pmRA:DE', 'pmRA:Plx', 'pmDE:RA',
+    'pmDE:DE', 'pmDE:Plx', 'pmDE:pmRA', 'F1', 'F2', '---', 'BTmag',
+    'e_BTmag', 'VTmag', 'e_VTmag', 'm_BTmag', 'B-V', 'e_B-V', 'r_B-V',
+    'V-I', 'e_V-I', 'r_V-I', 'CombMag', 'Hpmag', 'e_Hpmag', 'Hpscat',
+    'o_Hpmag', 'm_Hpmag', 'Hpmax', 'HPmin', 'Period', 'HvarType',
+    'moreVar', 'morePhoto', 'CCDM', 'n_CCDM', 'Nsys', 'Ncomp',
+    'MultFlag', 'Source', 'Qual', 'm_HIP', 'theta', 'rho', 'e_rho',
+    'dHp', 'e_dHp', 'Survey', 'Chart', 'Notes', 'HD', 'BD', 'CoD',
+    'CPD', '(V-I)red', 'SpType', 'r_SpType',
+)
+
 def load_dataframe(fobj, compression='gzip'):
    """Given an open file for `hip_main.dat.gz`, return a parsed dataframe.

@@ -49,21 +64,19 @@ def load_dataframe(fobj, compression='gzip'):

    """
    try:
-        from pandas import read_fwf
+        from pandas import read_csv
    except ImportError:
        raise ImportError(PANDAS_MESSAGE)

-    names, colspecs = zip(
-        ('hip', (2, 14)),
-        ('magnitude', (41, 46)),
-        ('ra_degrees', (51, 63)),
-        ('dec_degrees', (64, 76)),
-        ('parallax_mas', (79, 86)),  # TODO: have Star load this
-        ('ra_mas_per_year', (87, 95)),
-        ('dec_mas_per_year', (96, 104)),
+    df = read_csv(
+        fobj, sep='|', compression=compression, names=_COLUMN_NAMES,
+        usecols=['HIP', 'Vmag', 'RAdeg', 'DEdeg', 'Plx', 'pmRA', 'pmDE'],
+        na_values=['     ', '       ', '        ', '            '],
+    )
+    df.columns = (
+        'hip', 'magnitude', 'ra_degrees', 'dec_degrees',
+        'parallax_mas', 'ra_mas_per_year', 'dec_mas_per_year',
    )
-
-    df = read_fwf(fobj, colspecs, names=names, compression=compression)
    df = df.assign(
        ra_hours = df['ra_degrees'] / 15.0,
        epoch_year = 1991.25,