Switch to faster read_csv() for Hipparcos catalog

Yes, it has fixed-width fields, but Pandas is slow at importing those,
so we take advantage of the fact that it also has delimiters!
This commit is contained in:
Brandon Rhodes 2020-07-21 07:19:22 -04:00
parent a7c2794b60
commit 0af8e01b4a
1 changed file with 24 additions and 11 deletions

View File

@@ -41,6 +41,21 @@ analysis toolkit. Try installing it using your usual Python package
installer, like "pip install pandas" or "conda install pandas".
"""
# Labels for the fields of each `hip_main.dat` row, listed in the order
# the fields appear.  They are handed to pandas `read_csv()` as `names=`
# (with `sep='|'`) so that `usecols` can pick out columns by label
# rather than by position; the order here therefore must not change.
# NOTE(review): the spellings appear to follow the Hipparcos catalog's
# own field labels; 'HPmin' (capital P, next to 'Hpmax') may be a typo
# -- confirm against the catalog description before relying on it.
_COLUMN_NAMES = (
'Catalog', 'HIP', 'Proxy', 'RAhms', 'DEdms', 'Vmag',
'VarFlag', 'r_Vmag', 'RAdeg', 'DEdeg', 'AstroRef', 'Plx', 'pmRA',
'pmDE', 'e_RAdeg', 'e_DEdeg', 'e_Plx', 'e_pmRA', 'e_pmDE', 'DE:RA',
'Plx:RA', 'Plx:DE', 'pmRA:RA', 'pmRA:DE', 'pmRA:Plx', 'pmDE:RA',
'pmDE:DE', 'pmDE:Plx', 'pmDE:pmRA', 'F1', 'F2', '---', 'BTmag',
'e_BTmag', 'VTmag', 'e_VTmag', 'm_BTmag', 'B-V', 'e_B-V', 'r_B-V',
'V-I', 'e_V-I', 'r_V-I', 'CombMag', 'Hpmag', 'e_Hpmag', 'Hpscat',
'o_Hpmag', 'm_Hpmag', 'Hpmax', 'HPmin', 'Period', 'HvarType',
'moreVar', 'morePhoto', 'CCDM', 'n_CCDM', 'Nsys', 'Ncomp',
'MultFlag', 'Source', 'Qual', 'm_HIP', 'theta', 'rho', 'e_rho',
'dHp', 'e_dHp', 'Survey', 'Chart', 'Notes', 'HD', 'BD', 'CoD',
'CPD', '(V-I)red', 'SpType', 'r_SpType',
)
def load_dataframe(fobj, compression='gzip'):
"""Given an open file for `hip_main.dat.gz`, return a parsed dataframe.
@@ -49,21 +64,19 @@ def load_dataframe(fobj, compression='gzip'):
"""
try:
from pandas import read_fwf
from pandas import read_csv
except ImportError:
raise ImportError(PANDAS_MESSAGE)
names, colspecs = zip(
('hip', (2, 14)),
('magnitude', (41, 46)),
('ra_degrees', (51, 63)),
('dec_degrees', (64, 76)),
('parallax_mas', (79, 86)), # TODO: have Star load this
('ra_mas_per_year', (87, 95)),
('dec_mas_per_year', (96, 104)),
df = read_csv(
fobj, sep='|', compression=compression, names=_COLUMN_NAMES,
usecols=['HIP', 'Vmag', 'RAdeg', 'DEdeg', 'Plx', 'pmRA', 'pmDE'],
na_values=[' ', ' ', ' ', ' '],
)
df.columns = (
'hip', 'magnitude', 'ra_degrees', 'dec_degrees',
'parallax_mas', 'ra_mas_per_year', 'dec_mas_per_year',
)
df = read_fwf(fobj, colspecs, names=names, compression=compression)
df = df.assign(
ra_hours = df['ra_degrees'] / 15.0,
epoch_year = 1991.25,