In [1]:
import datetime
from multiprocessing import Pool
import os
import pdb
import re
import shutil

import cxutils as cx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wfdb
%matplotlib qt

# Root directory of the contributed dataset. Set the CEREBRAL_VASOREGULATION_DIR
# environment variable to run the notebook on another machine; when it is unset
# the original author's local path is used.
base_project_dir = os.environ.get(
    'CEREBRAL_VASOREGULATION_DIR',
    '/home/cx1111/Projects/pn-contribution-review/under-review/cerebral-vasoregulation')
# Every converted dataset is written below this directory.
base_write_dir = os.path.join(base_project_dir, 'output/')

Subject numbers are in the form **SXXXX**

# Dataset 1 - Stand sit tests

## Directories

- Input directory: `labview`
- Output directory: `sit-stand`

## Data Description

Labview and windaq files. 7 channel 500Hz. Files in the form `S####A.dat` or `S####A.wdq`.

Channels are:
0. marker
1. ecg
2. abp
3. thermst
4. flow rate
5. o2
6. co2

There are many more `dat` files than `wdq` files.

- There is one anomalous file that ends with 'A.dat': `S0361SACA.dat`, in addition to the expected `S0361SA.dat`.
- Three files (`S0214SA.dat`, `S0218SA.dat`, `S0221SA.dat`) have sizes in bytes that are not divisible by 7, so they cannot hold a whole number of 7-channel frames.

In [15]:
# LabVIEW recordings go in, converted sit-stand records come out.
input_dir = os.path.join(base_project_dir, 'labview')
write_dir = os.path.join(base_write_dir, 'sit-stand')

# Keep only the .dat files whose name ends in 'A.dat'.
input_files = [path for path in cx.list_files(input_dir, extensions=['dat'])
               if path.endswith('A.dat')]

In [None]:
# Inspecting for format.
# We know there are supposed to be 7 channels.

for file in input_files[:5]:
    file_size = os.path.getsize(file)

    # This is expected to be 0. And it is for all files except: S0214SA.dat S0218SA.dat S0221SA.dat
    print(file_size % 7)

    if file_size % 7:
        continue

    # Inspect duration. We know it should be about 5 minutes sitting, 5 minutes standing, for at least 10m (600s) total.
    # Time (s) = samples per channel / 500
    bytes_per_chan = file_size / 7
    # 4 bytes per sample gives us range from 600 to 1000+ seconds (except anomaly 90s). Seems reasonable.
    # If 8 bytes, it would not be long enough.
    print(bytes_per_chan / 500 / 4)

    # Graph the waveforms. Sweep parameters: endianness, bits, signed/unsigned/float.
    # It seems >f4 is the correct format. Only int8 (signed or unsigned) gives any other reasonable wave
    # but due to duration, logic (no 8 bit precision), and magnitude, we decide that >f4 is correct.
    i = 0

    for endian in ['>', '<']:
        for fmt in ['i', 'f']:
            for bit in ['1', '2', '4', '8']:

                # There is no 1 or 2 byte float format in this sweep
                if fmt == 'f' and int(bit) < 4:
                    continue

                dtype = endian + fmt + bit
                a = np.fromfile(file, dtype)
                # A wrong candidate item size can leave a number of items that is not a
                # multiple of 7 (e.g. 8-byte items on an odd number of 28-byte frames).
                # Drop the incomplete trailing frame so the reshape cannot raise.
                a = a[:a.size - a.size % 7].reshape((-1, 7))
                plt.figure(i)
                plt.title(dtype)
                # Plot channel 1, which should be ecg
                plt.plot(a[:,1])
                i += 1

    plt.show()

# Correct format is: >f4

In [None]:
# Inspect the anomalous files.
# Full paths of the two S0361 recordings that share a subject number.
s0361_paths = [os.path.join(input_dir, f) for f in ['S0361SACA.dat', 'S0361SA.dat']]

for file in input_files:
    file_size = os.path.getsize(file)

    # S0214SA.dat S0218SA.dat S0221SA.dat have incorrect file sizes for 7 channels
    if file_size % 7:
        # Read the largest whole number of 28-byte frames (7 channels, 4 bytes/sample) and visualize
        readable_size = file_size - file_size % 28
        sig = np.fromfile(file, '>f4', count=readable_size // 4).reshape((-1, 7))
        # NOTE(review): channel 6 is scaled by 100 here, but by 7.74 in the cell that
        # writes the records - confirm which factor was intended for this inspection.
        for column, factor in ((2, 100), (5, 9.09), (6, 100)):
            sig[:, column] = sig[:, column] * factor
        wfdb.plot_items(sig, title=cx.basebasename(file))
        input()

    # S0361SACA.dat has an anomalous name in addition to the expected S0361SA.dat.
    if file in s0361_paths:
        sig = np.fromfile(file, '>f4').reshape((-1, 7))
        for column, factor in ((2, 100), (5, 9.09), (6, 100)):
            sig[:, column] = sig[:, column] * factor

        wfdb.plot_items(sig, title=cx.basebasename(file))
        input()

# Comments: S0214SA.dat, S0218SA.dat, S0221SA.dat look like nothing in the 7 channel option. Ranges make no sense.
# S0361SA.dat looks correct. sig_len ~= 360000. S0361SACA.dat channels actually have the same amplitude range as other
# files, but it seems massively downsampled. No useful recognizable waveform shape.


In [None]:
# Investigate whether the non-7 channel files may have another number of channels
# Inspect the anomalous files.
for file in input_files:
    file_size = os.path.getsize(file)
    # Only S0214SA.dat S0218SA.dat S0221SA.dat have sizes that do not fit 7 channels
    if not file_size % 7:
        continue

    # Try every channel count from 1 to 8, assuming 4 bytes per sample
    for n_sig in range(1, 9):
        frame_bytes = 4 * n_sig
        # Largest prefix of the file that holds whole frames for this channel count
        readable_size = file_size - file_size % frame_bytes
        sig = np.fromfile(file, '>f4', count=readable_size // 4).reshape((-1, n_sig))
        wfdb.plot_items(sig, title=cx.basebasename(file))
        input()

# All channels from 1-8 do not look like proper data. Ignore these files.

In [9]:
# Write the files
# Make sure the destination exists so the cell also runs on a fresh output tree.
os.makedirs(write_dir, exist_ok=True)

for file in input_files:
    file_size = os.path.getsize(file)
    # Skip the anomalous files. One frame is 7 channels * 4 bytes = 28 bytes, so any
    # size that is not a multiple of 28 cannot be reshaped into 7 '>f4' columns.
    if file_size % 28 or file.endswith('S0361SACA.dat'):
        print('Skipping %s' % os.path.basename(file))
        continue

    # 'SxxxxSA' -> 'sxxxx-sit-stand'
    record_name = 's' + cx.basebasename(file).replace('SA', '-sit-stand')[1:]
    # There's a file S00231A.dat. Should be S0231.
    # NOTE(review): a base name without 'SA' would not get the '-sit-stand' suffix - confirm the real file name.
    if '00231' in record_name:
        record_name = record_name.replace('00231', '0231')
    sig = np.fromfile(file, '>f4').reshape((-1, 7))

    # Apply calibrations according to spreadsheet
    sig[:, 2] = sig[:, 2] * 100
    sig[:, 5] = sig[:, 5] * 9.09
    sig[:, 6] = sig[:, 6] * 7.74

    wfdb.wrsamp(record_name, fs=500, units=['NU','mV', 'mmHg','NU', 'NU', 'mmHg', 'mmHg'],
                sig_name=['marker', 'ecg', 'abp', 'thermst', 'flow_rate', 'o2', 'co2'],
                p_signal=sig, fmt=['16'] * 7, write_dir=write_dir)

# Overall comments: The amplitude of the CO2 channels between records seems to be in 2 groups.

In [None]:
# Inspect the written files
for record_path in cx.list_records(write_dir):
    written = wfdb.rdrecord(record_path)
    wfdb.plot_wfdb(written, title=record_path)
    # Wait for Enter before loading the next record
    input()

# Dataset 2 - 24h bp

## Directories

- Input directory: 24h-bp
- Output directory: 24h-bp

## Data description

Text files of bp summaries, collected every 20-30m. `*.R` and `*.V` files.

In [166]:
# Input and output directories for the 24h blood pressure summaries.
input_dir = os.path.join(base_project_dir, '24h-bp')
write_dir = os.path.join(base_write_dir, '24h-bp')

# Collect the '.R' and '.V' files (treated as raw and verified by the renaming cell).
input_files = cx.list_files(input_dir, extensions=['R','V'])

In [167]:
# They seem to be readable. Just change names

# Renaming:
# Got rid of extra 0 in S00169SA.R and S00169SA.V. S0033A.L/R -> S0033SA.L/R
# There is S02324SA.R and S02324SA.V. This subject number is incorrect?
# S0205.R and S0205SA.R, and S0205.V and S0205SA.V have same content. Delete S0205.R and S0205.V.

def _bp_output_name(file):
    """
    Build the output file name '<base>-bp-<raw|verified>-<0|1>.txt' for one input file.

    Files ending in '.R' are labelled 'raw', everything else 'verified'. A base name
    ending in 'sa' (or with neither suffix) gets index '0'; one ending in 'sb' gets '1'.
    """
    file_type = 'raw' if file.endswith('.R') else 'verified'
    # presumably the file name without directory and extension - verify against cx
    base_name = cx.basebasename(file).lower()

    if base_name.endswith('sa'):
        stem, index = base_name[:-2], '0'
    elif base_name.endswith('sb'):
        stem, index = base_name[:-2], '1'
    else:
        stem, index = base_name, '0'

    return '-'.join([stem, 'bp', file_type, index]) + '.txt'


all_names = [_bp_output_name(file) for file in input_files]

# Two inputs mapping to the same output name would silently overwrite each other,
# so refuse to copy anything until every name is unique.
clashes = sorted(set(name for name in all_names if all_names.count(name) > 1))
if clashes:
    raise ValueError('Output names are not unique: %s' % ', '.join(clashes))

# shutil.copyfile does not create missing directories
os.makedirs(write_dir, exist_ok=True)
for file, file_name in zip(input_files, all_names):
    shutil.copyfile(file, os.path.join(write_dir, file_name))

# Dataset 3 - Beat to beat bp


## Directories

- Input directory: portapress, portapress-new
- Output directory: resting-bp

## Data Description

Recorded with portapress/beatscope. Resting continuous bp for 1.5/2 hours in supine, reclining, or sitting position.



In [None]:
# Input and output directories for the beat to beat blood pressure recordings.
input_dir = os.path.join(base_project_dir, 'portapress')
write_dir = os.path.join(base_write_dir, 'resting-bp')
# Collect the .dat files in the input directory.
input_files = cx.list_files(input_dir, extensions=['dat'])

In [None]:
# File sizes are not all divisible by 2. Seems we are forced to use the beatscope software to figure out
# the format.
# The loop only reads each file's size; the parity printout below is disabled.
for file in input_files:
    file_size = os.path.getsize(file)
    #print(file_size % 2)

# Dataset 4 - 24h Myography

## Directories

- Input directory: me6000
- Output directory: 24h-electromyography

## Data Description

We have .markers, .TFF, and .txt files. But not all records have text/marker files. They all have tff, so we have to convert these. The tff files should have 7 channels. But they're not all 7.

They contain ecg, eeg, and accelerometer data.


Header values: `od -t u2 -N 512 --endian=big 07031501.TFF`

Sample values: `od -t d2 -N 512 --endian=big 07031501.TFF`



In [None]:
from wfdb.io import rdtff

input_dir = os.path.join(base_project_dir, 'me6000/data')
write_dir = os.path.join(base_write_dir, '24h-electromyography')

# tff files are separated by subjects: keep the directories whose name starts with 's' or 'S'
subject_dirs = []
for candidate in cx.list_dirs(input_dir):
    if os.path.basename(candidate).lower().startswith('s'):
        subject_dirs.append(candidate)
input_files = cx.list_files(input_dir, extensions=['dat'])

# Every TFF file found in the subject directories
tff_files = cx.list_files(subject_dirs, extensions=['TFF'])

# I renamed the files. Lowercased the 's', replaced underscores with hyphens, and appended subject
# numbers to tff files if missing.

In [None]:
# Identifying all the problem files

def tryread(filename):
    """
    Try to read a TFF file with rdtff and report the outcome.

    Prints whether the read succeeded. Returns the filename if reading raised
    an exception, otherwise None.
    """
    name = os.path.basename(filename)
    try:
        # The original called the undefined name `iordtff`; the bare except then
        # hid the NameError, so every file was reported as failed.
        rdtff(filename)
    except Exception as exc:
        # Show the error so a systematic failure is not mistaken for a bad file
        print('%s failed: %r' % (name, exc))
        return filename
    print('%s succeeded' % name)
    return None

# tff files are separated by subjects
subject_dirs = [folder for folder in cx.list_dirs(input_dir)
                if os.path.basename(folder).lower().startswith('s')]
tff_files = cx.list_files(subject_dirs, extensions=['TFF'])

# Attempt every read in parallel; tryread returns the filename only when the read fails
with Pool(processes=30) as pool:
    read_results = pool.map(tryread, tff_files)
# Drop the None entries (successful reads) and duplicates
problem_files = list({path for path in read_results if path})

In [None]:
# These problem files have incorrect lengths for their specified number of channels.
# I read up to just before the end, and plot the signals.
# Each path is built from input_dir as <subject>/<subject>-<recording>.TFF, so the
# list follows the configured project directory instead of repeating an absolute path.
problem_files = [
    os.path.join(input_dir, name.split('-')[0], name)
    for name in (
        's0355-07090605.TFF',
        's0164-06033006.TFF',
        's0371-07091105.TFF',
        's0376-07091805.TFF',
        's0343-07080705.TFF',
    )
]

def getproblem(file):
    """
    Read one problem TFF file with cut_end=True and return what rdtff gives back.

    Returns the tuple (signal, fields, markers, triggers).
    """
    contents = rdtff(file, cut_end=True)
    print('Finished file %s' % file)
    signal, fields, markers, triggers = contents
    return signal, fields, markers, triggers

# Read the problem files in parallel, one process per file
with Pool(processes=5) as pool:
    values = pool.map(getproblem, problem_files)

# Plot each file's signals, labelled with its channel names
for path, (signal, fields, markers, triggers) in zip(problem_files, values):
    wfdb.plot_items(signal, ylabel=fields['sig_name'], title=os.path.basename(path))

# They look fine, so we will take them.

In [None]:
# The final conversion. Convert ALL tff files to wfdb
def convert_tff(file):
    """
    Convert one TFF file to a wfdb record in write_dir.

    The signal is written with wrsamp under the file's base name. If the file
    has markers or triggers, they are written as annotation files with the
    extensions 'marker' and 'trigger'.

    Parameters
    ----------
    file : str
        Path of the TFF file to convert.
    """
    # Files whose length does not fit their stated number of channels; they are
    # read with cut_end=True. Matching on the file name rather than the full
    # path keeps this working when the data directory is moved.
    problem_names = {
        's0355-07090605.TFF',
        's0164-06033006.TFF',
        's0371-07091105.TFF',
        's0376-07091805.TFF',
        's0343-07080705.TFF',
    }
    cut_end = os.path.basename(file) in problem_names

    # Read the tff file
    signal, fields, markers, triggers = rdtff(file, cut_end=cut_end)

    record_name = cx.basebasename(file)
    n_sig = fields['n_sig']

    # Write the wfdb record file
    wfdb.wrsamp(record_name=record_name, fs=fields['fs'], d_signal=signal,
                sig_name=fields['sig_name'], adc_gain=[1]*signal.shape[1],
                fmt=['16']*n_sig, baseline=[0]*n_sig,
                units=['uV']*n_sig, base_time=fields['base_time'],
                base_date=fields['base_date'], write_dir=write_dir)
    # Write any annotation locations
    if markers.size:
        wfdb.wrann(record_name=record_name, sample=markers, extension='marker',
                   symbol=['"']*markers.size, aux_note=['marker']*markers.size, write_dir=write_dir)
    if triggers.size:
        wfdb.wrann(record_name=record_name, sample=triggers, extension='trigger',
                   symbol=['"']*triggers.size, aux_note=['trigger']*triggers.size, write_dir=write_dir)
    print('Converted file %s' % os.path.basename(file))

# NEED MORE RAM! Wish I could use more processes.
n_workers = 8
with Pool(processes=n_workers) as pool:
    # convert_tff returns nothing, so the mapped result is discarded
    _ = pool.map(convert_tff, tff_files)

print('DONE!')


# Dataset 5 - Pedar
## Directories

- Input directory: `pedar`
- Output directory: `walking`

## Data Description

There are `.ASC`, `.9rg`, and `.gtc` text files. There are also some `.xls` duplicates, which are not useful.

- All 9rg files have 18 lines and only have numbers/nan. 9 channel.
- ASC have text headers. 99 channel.
- `gtc` files have N channels. Space delimited. What are channel names? Double space delimitation...

12 minute walk test.

Problems: `S0378A1.9rg` exists in both the `9rg` and `asc` folders, with the same name but different content.

In [26]:
# 1. gtc files. There is no date/time info.
# I renamed the column headings and got rid of the extra '*' characters present in some lines.
# I renamed SO197S-1.gtc (letter O instead of zero after the S) to S0197S-2.gtc and SO157S-1.gtc to SO157S-4.gtc
# Input and output directories for the gtc files.
input_dir = os.path.join(base_project_dir, 'pedar/gtc')
write_dir = os.path.join(base_write_dir, 'walking')
input_files = cx.list_files(input_dir, extensions=['gtc'])

# Convert the step files to header and csv text files.
# (Despite its name, read_9rg is applied to the gtc files listed in this cell.)

def read_9rg(file):
    """
    Read a whitespace-delimited step file with free-text header lines.

    Every line before the first one starting with 'step_no' is collected
    (stripped) as a header line. The 'step_no' line supplies the column
    names of the table that follows.

    Returns
    -------
    header_lines : list of str
        The stripped lines that precede the column heading line.
    data : pandas.DataFrame
        The table parsed from the column heading line onwards.
    """
    header_lines = []
    with open(file, 'r') as fp:
        for line in fp:
            if line.startswith('step_no'):
                break
            header_lines.append(line.strip())
    # sep=r'\s+' replaces the deprecated delim_whitespace=True
    data = pd.read_csv(file, sep=r'\s+', skiprows=len(header_lines))
    return header_lines, data

# Write data to csv files, and headers to txt files
for file in input_files:
    lines, data = read_9rg(file)
    # Common prefix of both output files for this input
    out_stem = os.path.join(write_dir, cx.basebasename(file))
    # First line is useless, so it is left out of the info file
    with open(out_stem + '-steps-info.txt', 'w') as fp:
        fp.writelines("%s\n" % line for line in lines[1:])
    data.to_csv(out_stem + '-steps.csv', index=False)


In [27]:
# 2. asc files. The fs is specified in the header info. There is no date/time info.
# Get the data and convert to wfdb
# I renamed SO157S-1.asc to S0157s-4.asc, to link to the original SO157S-1.gtc mentioned above

# base_project_dir and base_write_dir are defined once in the configuration cell at
# the top of the notebook; they are not redefined here so that there is a single
# place to change the project location.
input_dir = os.path.join(base_project_dir, 'pedar/asc')
write_dir = os.path.join(base_write_dir, 'walking')
input_files = cx.list_files(input_dir, extensions=['ASC'])

def read_asc(file):
    """
    Read an ASC file: a text header followed by a whitespace-delimited table.

    The header is scanned for 'time per frame[secs]:  <period>', from which the
    sampling frequency is taken as 1 / period. The header ends at the first
    line that starts with 'time' (the column heading line); everything after
    it is parsed as data without column names.

    Returns
    -------
    data : pandas.DataFrame
        The table that follows the header.
    fs : float
        Sampling frequency, 1 / time per frame.

    Raises
    ------
    ValueError
        If no 'time per frame[secs]' value is found in the header.
    """
    # Raw string: the original non-raw literal relied on invalid escape sequences
    rx = re.compile(r'time per frame\[secs\]\:\s+(?P<period>\d{1}\.\d+)')
    fs = None
    n_header_lines = 0
    with open(file, 'r') as fp:
        for line in fp:
            line = line.strip()
            n_header_lines += 1
            # Try to find fs. Format: 'time per frame[secs]:  0.019'
            if 'time per frame' in line:
                match = rx.search(line)
                if match:
                    fs = 1/float(match.group('period'))
            # elif: the 'time per frame' line is still part of the header,
            # even if it happens to start the line
            elif line.startswith('time'):
                break
    if fs is None:
        raise ValueError('No "time per frame[secs]" value found in the header of %s' % file)
    # sep=r'\s+' replaces the deprecated delim_whitespace=True
    data = pd.read_csv(file, sep=r'\s+', skipinitialspace=True, skiprows=n_header_lines, header=None)
    return data, fs

# Write each ASC table as a wfdb record named '<base>-pressure'
for file in input_files:
    data, fs = read_asc(file)
    # Column 0 is left out of the record; the remaining columns are the signals
    n_sig = data.shape[1] - 1
    # The signal names run 1..n_sig/2 twice, prefixed 'm1_' and then 'm2_'
    channel_numbers = range(1, int(n_sig/2)+1)
    pressure_names = (['m1_pressure_'+str(ch) for ch in channel_numbers]
                      + ['m2_pressure_'+str(ch) for ch in channel_numbers])
    wfdb.wrsamp(cx.basebasename(file)+'-pressure', fs=fs, units=n_sig*['N/cm2'],
                sig_name=pressure_names,
                p_signal=data.iloc[:, 1:].values, fmt=n_sig*['16'], write_dir=write_dir)


In [30]:
# 3. 9rg files. Just rename them and replace whitespaces with commas

input_dir = os.path.join(base_project_dir, 'pedar/9rg')
write_dir = os.path.join(base_write_dir, 'walking')
input_files = cx.list_files(input_dir, extensions=['9rg'])

# Any run of whitespace becomes a single comma
rx = re.compile(r'\s+')
for file in input_files:
    lines = []
    for raw_line in cx.read_lines(file):
        lines.append(rx.sub(',', raw_line.strip()))
    csv_path = os.path.join(write_dir, cx.basebasename(file)+'-9rg.csv')
    cx.write_lines(csv_path, lines)
    
    

# Dataset 6 - Head up tilt
## Directories

- Input directory: `labview`
- Output directory: `head-up-tilt`

## Data Description

`S####B.dat` 10 channel labview files.

In [13]:
# LabVIEW recordings go in, converted head-up-tilt records come out.
input_dir = os.path.join(base_project_dir, 'labview')
write_dir = os.path.join(base_write_dir, 'head-up-tilt')

# Keep only the .dat files whose name ends in 'SB.dat'.
input_files = [path for path in cx.list_files(input_dir, extensions=['dat'])
               if path.endswith('SB.dat')]

In [14]:
# Write the files
# All file_size % 40 == 0 which is good. 10 channel >f4
hut_sig_names = ['marker', 'ecg', 'abp', 'mcar', 'mcal', 'radi', 'thermst', 'flow_rate', 'o2', 'co2']
hut_units = ['NU','mV', 'mmHg','cm/s', 'cm/s', 'cm/s', 'NU', 'NU', 'mmHg', 'mmHg']

for file in input_files:
    # 'SxxxxSB' -> 'sxxxx-head-up-tilt'
    record_name = 's' + cx.basebasename(file).replace('SB', '-head-up-tilt')[1:]
    sig = np.fromfile(file, '>f4').reshape((-1, 10))
    # Apply calibrations according to spreadsheet: (column, factor)
    for column, factor in ((2, 100), (8, 9.09), (9, 7.74)):
        sig[:, column] = sig[:, column] * factor

    wfdb.wrsamp(record_name, fs=500, units=hut_units,
                sig_name=hut_sig_names,
                p_signal=sig, fmt=['16'] * 10, write_dir=write_dir)


In [None]:
# Inspect the written files
for record_path in cx.list_records(write_dir):
    written = wfdb.rdrecord(record_path)
    wfdb.plot_wfdb(written, title=record_path)
    # Wait for Enter before loading the next record
    input()

# Dataset 7 - Sit to stand with eyes open and closed on balance platform

## Directories

- Input directory: `labview`
- Output directory: `sit-stand-balance`

## Data Description

`S####C.dat` 15 channel labview files.

In [16]:
# LabVIEW recordings go in, converted sit-stand-balance records come out.
input_dir = os.path.join(base_project_dir, 'labview')
write_dir = os.path.join(base_write_dir, 'sit-stand-balance')

# Keep only the .dat files whose name ends in 'SC.dat'.
input_files = [path for path in cx.list_files(input_dir, extensions=['dat'])
               if path.endswith('SC.dat')]

In [17]:
# Write the files
# All file_size % 60 == 0 which is good. 15 channel >f4

balance_sig_names = ['marker', 'ecg', 'abp', 'mcar', 'mcal', 'radi', 'thermst', 'flow_rate', 'o2', 'co2', 'fx', 'fy', 'fz', 'px', 'py']
balance_units = ['NU','mV', 'mmHg','cm/s', 'cm/s', 'cm/s', 'NU', 'ml/s', 'mmHg', 'mmHg', 'mm', 'mm', 'mm', 'mm', 'mm']

for file in input_files:
    # 'SxxxxSC' -> 'sxxxx-sit-stand-balance'
    record_name = 's' + cx.basebasename(file).replace('SC', '-sit-stand-balance')[1:]
    sig = np.fromfile(file, '>f4').reshape((-1, 15))
    # Apply calibrations according to spreadsheet: (column, factor)
    for column, factor in ((2, 100), (8, 9.09), (9, 7.74)):
        sig[:, column] = sig[:, column] * factor
    # There are inf values annoyingly; replace them with nan
    sig[np.isinf(sig)] = np.nan
    wfdb.wrsamp(record_name, fs=500,
                units=balance_units,
                sig_name=balance_sig_names,
                p_signal=sig, fmt=['16'] * 15, write_dir=write_dir)


# Dataset 8 - Transcranial Doppler
## Directories

- Input directory: tcd/exported-all
- Output directory: transcranial-doppler

## Data Description

`SXXXXSB.XL0` and `SXXXXSB.XL1` files. For the same subject, XL1 comes after XL0.

Issues:
- `S0185SB.XL0` has channels E2, E3, E7.... What are these?
- Only `S0078SB.XL0` has etco2 channel.
- `S0172SB.XL0` has two mcal channels. mcal comes before mcar. Should be ok? Drop the second mcal like we drop mcar.

In [19]:
# Input and output directories for the transcranial doppler exports.
input_dir = os.path.join(base_project_dir, 'tcd/exported-all')
write_dir = os.path.join(base_write_dir, 'transcranial-doppler')
# Collect the '.XL0' and '.XL1' export files.
input_files = cx.list_files(input_dir, extensions=['XL0', 'XL1'])

In [3]:
# There are 3 anomalously named files: S0183.XL1 and S0121.XL1, S0176SC.XL0. Their contents are identical to S0183SB.XL1
# S0121SB.XL1, S0176SB.XL1 so I deleted them. rm S0183.XL1 S0121.XL1 S0176SC.XL0

# There's the file S0000SB.XL0 but there is no s0. Inside the file says: patient exam: S0172SC. Renamed the
# file S0172SB.XL1 since the time starts just a bit after S0172SB.XL0. Renamed S0030DB.XL1 to S0030SB.XL1

# Some files have extra labels in certain rows.
# Extra annotations include: 'VALSALVA, 'BASELINE', 'T-HYPOVENT', 'STAND-EC', 'STAND-EO', 'HYPERVENT', 'HYPOVENT'
# 'TILT'

# Before reading csv content, we must extract these annotations into separate files, and clean them from the files.
for file in input_files:
    # Count the rows to skip: everything up to and including the 'TIME ... MCAR' heading
    # line, plus the one line after it. Without a heading line, every line is counted.
    n_header = 0
    with open(file, 'r') as f:
        for line_index, line in enumerate(f):
            if line.startswith('TIME') and 'MCAR' in line:
                n_header = line_index + 2
                break
            n_header = line_index + 1

    # The cleaned rows and the rejected rows are written next to the input file,
    # with lowercased names prefixed 'clean-' and 'extra-'.
    dname, fname = os.path.split(file)
    clean_path = os.path.join(dname, 'clean-'+fname.lower())
    extra_path = os.path.join(dname, 'extra-'+fname.lower())

    cx.clean_dirty_csv(file, output_csv_file=clean_path,
                       output_bad_file=extra_path, skiprows=n_header)


In [24]:
# Convert the data files into wfdb
# Only the 'clean-' files written by the cleaning cell are converted
input_files = [path for path in cx.list_files(input_dir, extensions=['xl0', 'xl1'])
               if os.path.basename(path).startswith('clean')]

def read_tcd(file):
    """
    Read a cleaned transcranial doppler export file.

    Lines before the 'TIME ... MCAR' column heading line are returned as header
    lines. The heading line gives the signal names (its first field is dropped).
    The heading line and the line after it are skipped, and the rest is parsed
    as a whitespace-delimited table whose first column holds '%H:%M:%S.%f'
    time stamps. When 'mcar' or 'mcal' appears more than once, only its first
    column is kept.

    Returns
    -------
    header_lines : list of str
        Stripped lines that precede the column heading line.
    sig_names : list of str
        Output signal names after mapping through name_map.
    units : list of str
        Unit of each signal, in the same order as sig_names.
    base_time : datetime.time
        Time stamp of the first data row.
    data : pandas.DataFrame
        Signal columns only (the time column is removed).

    Raises
    ------
    ValueError
        If the file has no 'TIME ... MCAR' column heading line.
    KeyError
        If a signal name is not in name_map.
    """
    header_lines = []
    sig_names = None
    with open(file, 'r') as fp:
        for line in fp:
            if line.startswith('TIME') and 'MCAR' in line:
                sig_names = [s.lower() for s in line.split()][1:]
                break
            header_lines.append(line.strip())
    if sig_names is None:
        raise ValueError('No TIME/MCAR column heading line found in %s' % file)

    # sep=r'\s+' replaces the deprecated delim_whitespace=True
    data = pd.read_csv(file, sep=r'\s+', skiprows=len(header_lines)+2, header=None)
    base_time = datetime.datetime.strptime(data.iloc[0,0], '%H:%M:%S.%f').time()

    # Drop the time column
    data = data.iloc[:, 1:]

    # Remove extra mcar mcal channels, keeping the first of each. The wanted
    # indices are collected in column order; a set difference does not
    # guarantee that order.
    wanted_inds = []
    seen_cerebral = set()
    for ind, name in enumerate(sig_names):
        if name in ('mcar', 'mcal'):
            if name in seen_cerebral:
                continue
            seen_cerebral.add(name)
        wanted_inds.append(ind)

    if len(wanted_inds) != len(sig_names):
        sig_names = [sig_names[ind] for ind in wanted_inds]
        data = data.iloc[:, wanted_inds]

    # Alter signal names
    name_map = {'mcar':'cerebral_blood_velocity_right','mcal':'cerebral_blood_velocity_left',
                'brar':'brachial_blood_velocity', 'etco2':'et_co2', 'abp':'abp', 'co2':'co2', 'rsp':'resp',
                'ekg':'ecg', 'e1':'e1', 'e2':'e2', 'e3':'e3', 'e7':'e7', 'e8':'e8', 'hr':'hr'}
    sig_names = [name_map[s] for s in sig_names]

    # Velocity channels are in cm/s, matching the units the labview datasets use
    # for mcar/mcal (they were labelled 'mmHg' before).
    # NOTE(review): the units of e1, e2, e3, e7 and e8 are unknown; 'mmHg' is kept - confirm.
    unit_dict = {'cerebral_blood_velocity_right': 'cm/s', 'cerebral_blood_velocity_left':'cm/s',
                 'brachial_blood_velocity':'cm/s', 'abp':'mmHg', 'co2':'mmHg','et_co2':'mmHg',
                 'resp':'NU', 'ecg':'uV', 'e1':'mmHg', 'e2':'mmHg', 'e3':'mmHg', 'e7':'mmHg', 'e8':'mmHg', 'hr':'bpm'}
    units = [unit_dict[s] for s in sig_names]
    return header_lines, sig_names, units, base_time, data

# Write data to wfdb files
# I don't think there's any important info in the header lines
for file in input_files:
    # Read the main data
    header_lines, sig_names, units, base_time, data = read_tcd(file)

    # Read the corresponding annotations. Only the file name is rewritten, so a
    # directory containing 'clean' in its name cannot corrupt the path.
    dname, fname = os.path.split(file)
    ann_file = os.path.join(dname, fname.replace('clean', 'extra', 1))
    # sep=r'\s+' replaces the deprecated delim_whitespace=True.
    # ann_data is not written anywhere in this loop; it is kept so the rows for
    # the last file can be inspected afterwards.
    ann_data = pd.read_csv(ann_file, sep=r'\s+', header=None)

    n_sig = data.shape[1]
    # Base name without its first 6 and last 2 characters, plus the last character of the path
    record_name = cx.basebasename(file)[6:-2] + '-tcd-' + file[-1]

    wfdb.wrsamp(record_name, fs=50, units=units,
                sig_name=sig_names,
                p_signal=data.values, fmt=n_sig*['16'],
                base_time=base_time,
                write_dir=write_dir)

In [23]:
# Display the annotation rows left in ann_data by the conversion loop (those of the last file it read).
ann_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,11:57:18.410,34.3,0.0,0.0,29.2,0.0,87.9,36.3,18.6,-1.5,-11.8,BASELINE
1,12:02:37.320,27.5,0.0,0.0,32.6,-1.2,77.4,38.5,17.1,-1.5,-11.1,STAND-EO
2,12:02:39.380,25.7,0.0,0.0,29.2,1.7,72.0,39.0,4.7,-1.5,-22.0,STAND-EO
3,12:05:52.740,36.0,0.0,0.0,36.0,4.3,96.6,39.5,40.4,-1.5,-20.5,BASELINE
4,12:10:59.660,32.6,0.0,0.0,37.8,0.0,82.6,36.2,35.0,-1.5,-9.0,STAND-EC
5,12:14:12.060,27.5,0.0,0.0,27.5,0.0,70.8,32.7,-0.1,-1.5,-16.6,BASELINE


In [171]:
# Write annotation files for those lines that were removed from the XL files

lines = cx.read_lines()