Source code for getShimadzuData

"""
The .daml files provided by Shimadzu come in an XML format
in which the key data (the chromatogram, the observed m/z ratios
with their corresponding intensities, and the UV traces) are
encoded in base64.

The following script extracts the data relevant to PyParse, and reformats
it into a common structure. 

"""

import xml.etree.ElementTree as ET
import base64
import glob
import math


def init(args):
    """
    Initialise global options.
    """
    global options
    options = args
def getData(input_dir):
    """
    Search the directory specified by the user to acquire any and all
    input LC-MS data that is present. Format this data into the
    preferred PyParse format.

    :param input_dir: A string specifying the directory to search for .daml files

    :return: A list comprising: a dictionary of all peaks in all wells
        (indexed by well); a dictionary of all chromatograms; a dictionary
        of all sample IDs; a dictionary total_area_abs of each well.
        Each of these dictionaries is indexed by well.
    """

    masterTable = {}
    chroma = {}
    sample_IDs = {}
    total_area_abs = {}

    input_files = glob.glob(input_dir + '*.daml')

    for filename in input_files:
        #Open the .daml file, which is structured as an xml
        tree = ET.parse(filename)
        root = tree.getroot()

        #If the number of rows or number of columns has not yet been
        #specified, determine these values from the .daml file
        if options.plate_col_no == 0 or options.plate_row_no == 0:
            vessels = root.find("datafile_attribs").find("Plate").find("Vessels")

            #determine the number of columns/rows by how many times
            #a well has the y-value = 0 or x-val = 0 respectively
            for vessel in vessels.findall("Vessel"):
                if int(vessel.get("x")) == 0:
                    options.plate_row_no += 1
                if int(vessel.get("y")) == 0:
                    options.plate_col_no += 1

        #get the well number
        wellno = int(root.find("datafile_attribs").find("vial").text)

        #get the sampleID
        sample_IDs[wellno] = str(root.find("datafile_attribs").find("desc").text)

        #Get to the results section of the .daml file
        results = root.find("results")

        peaks = {}

        #start by getting the peaks observed in the UV spectrum
        raw_UV = results.find("lc_chros").find("chro")
        for peak in raw_UV.findall("peak"):

            #Fetch all data about a peak from the details section
            new_entry = {}
            for detail in peak.get("details").split(","):
                name = detail.strip().split("=")[0]
                value = detail.strip().split("=")[1]
                if name == "Peak#":
                    new_entry["peakID"] = int(value)
                elif name == "RT":
                    new_entry["time"] = float(value.split("mins")[0].strip())
                    new_entry["pStart"] = float(value.split("(")[1].split("-")[0])
                    new_entry["pEnd"] = float(value.split("-")[1].split("mins")[0])
                elif name == "Area":
                    new_entry["areaAbs"] = float(value)
                elif name == "Area%":
                    new_entry["area"] = float(value)

            #Add placeholders for the correct data structure
            new_entry["MS+"] = [[]]
            new_entry["MS-"] = [[]]

            #Add well number to the new entry
            new_entry["well"] = wellno

            #Add the new entry to the peaks dictionary
            peaks[new_entry["peakID"]] = new_entry

        #get the chromatographic trace for this well
        coded_trace = raw_UV.find("data").text
        decoded_trace = base64.b64decode(coded_trace)
        trace_list = str(decoded_trace).split(";;;")[1].split("'")[0].split(";")
        xval = []
        yval = []
        for i in range(0, len(trace_list), 2):
            xval.append(float(trace_list[i])/60000)
            yval.append(float(trace_list[i+1]))

        #Normalise the heights of the peaks so that they plot correctly
        max_y_val = max(yval)
        normalised_yval = [100*i/max_y_val for i in yval]
        chroma[wellno] = [xval, normalised_yval]

        #next, get the mass spec data for each peak, and match it to the UV peaks
        raw_ms = results.find("spectra")
        for spec in raw_ms.findall("spec"):
            peakID = int(spec.find("peak_num").text)
            if peakID in peaks.keys():
                msdata = spec.find("data").text
                decoded = base64.b64decode(msdata)

                #determine whether this data is for MS+ or MS-
                ms_type = "MS+" if str(decoded).find("MS(+)") > -1 else "MS-"

                #Get the list of masses with their intensities, and group them
                #pairwise ([massion, intensity])
                decoded_list = str(decoded).split(";;;")[1].split("'")[0].split(";")
                masses = []
                total = 0
                for i in range(0, len(decoded_list), 2):
                    datapair = [float(decoded_list[i])/10000, float(decoded_list[i+1])]
                    masses.append(datapair)

                #normalise the intensities
                max_intensity = max(masses, key=lambda x: x[1])[1]
                normalised_masses = [[i[0], round(i[1]*100/max_intensity, 3)] for i in masses]

                #Remove any masses which are so small as to be negligible
                total = sum([i[1] for i in normalised_masses])
                refined_masses = [i for i in normalised_masses if math.floor(i[1]*100/total) > 0]

                if ms_type == "MS+":
                    peaks[peakID]["MS+"] = refined_masses
                else:
                    peaks[peakID]["MS-"] = refined_masses

        #finally, fetch the UV absorption maxima values
        raw_UV_ads = results.find("pda_spectra")
        for pda_spec in raw_UV_ads.findall("pda_spec"):

            #Get the peakID so the pda data is matched to the correct peaks
            peakID = int(pda_spec.find("peak_num").text)
            if peakID in peaks.keys():
                UV_maxima = []

                #Decode the base64 data, and group the data into pairs of x/y values
                UV_pairs = []
                pda_data = base64.b64decode(pda_spec.find("data").text)
                decoded_list = str(pda_data).split(";;;")[1].split("'")[0].split(";")
                for i in range(0, len(decoded_list), 2):
                    datapair = [float(decoded_list[i])/100, float(decoded_list[i+1])]
                    UV_pairs.append(datapair)

                #Tag the very first value as a maximum if required
                if UV_pairs[0][1] > UV_pairs[1][1] and UV_pairs[0][1] > options.min_uv_threshold:
                    UV_maxima.append(UV_pairs[0][0])

                #Iterate through all other n-1/n/n+1 comparisons to find all other maxima
                for i in range(1, len(UV_pairs)-1):
                    if (UV_pairs[i][1] > UV_pairs[i-1][1]
                            and UV_pairs[i][1] > UV_pairs[i+1][1]
                            and UV_pairs[i][1] > options.min_uv_threshold):
                        UV_maxima.append(UV_pairs[i][0])

                if UV_pairs[-1][1] > UV_pairs[-2][1] and UV_pairs[-1][1] > options.min_uv_threshold:
                    UV_maxima.append(UV_pairs[-1][0])

                #Store these maxima values with the peak
                peaks[peakID]["UV"] = UV_maxima

        #Filter out any peaks within retention times specified by the user
        #Expected uses: remove solvent peaks from consideration
        #Expected format is a list of specified ranges, e.g.
        #[min_time1-max_time1, min_time2-max_time2]
        #where the start of the range appears first and is separated from
        #the end of the range with a hyphen
        if len(options.filter_by_rt) > 0:
            to_remove = []
            for filter_range in options.filter_by_rt:
                min_time = float(filter_range.split("-")[0].strip())
                max_time = float(filter_range.split("-")[1].strip())
                for index in peaks.keys():
                    #Compare the retention time of each peak to the excluded range
                    if min_time < peaks[index]["time"] < max_time:
                        to_remove.append(index)

            for index in to_remove:
                del peaks[index]

        #If the filter_by_rt parameter was used, recalculate the total absolute
        #peak area for this well
        if len(options.filter_by_rt) > 0:
            total_area_abs[wellno] = 0
            for peak in peaks.values():
                total_area_abs[wellno] = total_area_abs[wellno] + peak["areaAbs"]

        #Recalculate the peak area percentages now that the total absolute area
        #for that well has been calculated.
        if len(options.filter_by_rt) > 0:
            for index in peaks.keys():
                peaks[index]["area"] = round((peaks[index]["areaAbs"]*100) / total_area_abs[wellno], 2)

        #Append the peaks to the specific well
        masterTable[wellno] = peaks.values()

    return [masterTable, chroma, sample_IDs, total_area_abs]
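Every base64 payload handled above (chromatogram trace, mass spectra and PDA spectra) is parsed the same way once decoded: the text before ";;;" is treated as a header, and the remainder is a flat, semicolon-separated list of alternating x/y values. The following is a minimal sketch of that decode step, run on a synthetic payload rather than real Shimadzu output; the header text and the 60000 divisor (taken to convert the time axis to minutes) are assumptions carried over from getData above.

import base64

#Synthetic stand-in for the text of a <data> element: "<header>;;;x1;y1;x2;y2;..."
payload = base64.b64encode(b"Chromatogram;;;60000;150.0;120000;300.0")

decoded = base64.b64decode(payload)

#getData operates on the repr of the bytes object (e.g. "b'...'"), hence the
#split on ";;;" followed by the split on the trailing quote character
values = str(decoded).split(";;;")[1].split("'")[0].split(";")

#Group the flat list pairwise into [x, y] points, scaling x as getData does
#for the chromatographic trace
points = [[float(values[i])/60000, float(values[i+1])]
          for i in range(0, len(values), 2)]

print(points)   #[[1.0, 150.0], [2.0, 300.0]]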
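A minimal usage sketch follows. PyParse normally supplies the options object from its own command-line arguments; here a hypothetical types.SimpleNamespace stands in for it, carrying only the attributes that getData reads (plate_col_no, plate_row_no, min_uv_threshold, filter_by_rt), with illustrative values. Note that the directory string is concatenated directly into glob.glob(input_dir + '*.daml'), so it needs a trailing separator.

import types
import getShimadzuData

#Hypothetical stand-in for the PyParse options object; attribute names match
#those referenced in getData, the values are illustrative only
options = types.SimpleNamespace(
    plate_col_no=0,             #0 -> infer the plate dimensions from the .daml file
    plate_row_no=0,
    min_uv_threshold=5,         #minimum intensity for a UV absorption maximum
    filter_by_rt=["0.0-0.3"],   #retention-time windows to discard, e.g. the solvent front
)

getShimadzuData.init(options)

masterTable, chroma, sample_IDs, total_area_abs = getShimadzuData.getData("./lcms_data/")

for well, peaks in masterTable.items():
    print(well, sample_IDs[well], [round(peak["time"], 2) for peak in peaks])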