Skip to content

Documentation for FeatureExtraction.py

FeatureExtraction

This file implements various methods to extract features from different sources.

extract_hmdb_metabolite

extract_hmdb_metabolite(in_file, delete=True)

Extract chemicals from HMDB database

Parameters:

Name Type Description Default
in_file

a zipped HMDB database downloaded from https://hmdb.ca/downloads.

required
delete

whether to delete in_file once it has been processed

True

Returns: a list of vimms.Chemicals.DatabaseCompound objects.

Source code in vimms/FeatureExtraction.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def _parse_hmdb_root(in_file):
    """
    Parse the HMDB XML document at `in_file` and return its root element.

    `in_file` is first opened as a zip archive, in which case the first
    archive member is parsed. If it is not a zip archive, `in_file` itself
    is handed to the XML parser.
    """
    try:
        zf = zipfile.ZipFile(in_file, "r")
    except zipfile.BadZipFile:
        # oops not a zip file: let ElementTree open (and close) the source
        # itself, so there is no handle left for us to clean up
        return xml.etree.ElementTree.parse(in_file).getroot()

    # the context managers close the member and the archive even when
    # parsing fails half-way through
    with zf:
        # assume there's only a single file inside the zip file
        metabolite_xml_file = zf.namelist()[0]
        with zf.open(metabolite_xml_file) as f:
            return xml.etree.ElementTree.parse(f).getroot()


def extract_hmdb_metabolite(in_file, delete=True):
    """
    Extract chemicals from HMDB database

    Args:
        in_file: a zipped HMDB database downloaded from https://hmdb.ca/downloads.
                 If the file is not a zip archive, it is parsed directly as XML.
        delete: whether to delete `in_file` once it has been processed

    Returns: a list of [vimms.Chemicals.DatabaseCompound][] objects. Only
             metabolites for which all six extracted fields have text
             are included.

    """
    logger.debug("Extracting HMDB metabolites from %s" % in_file)

    db = _parse_hmdb_root(in_file)

    # Map each fully-qualified child tag to its position in the argument list
    # of DatabaseCompound. The tag names are kept exactly as the original
    # code spelled them (including "monisotopic"); presumably they follow the
    # HMDB XML schema -- verify against a real download before changing them.
    prefix = "{http://www.hmdb.ca}"
    field_tags = (
        "name",
        "chemical_formula",
        "monisotopic_molecular_weight",
        "smiles",
        "inchi",
        "inchikey",
    )
    field_index = {prefix + tag: i for i, tag in enumerate(field_tags)}

    # loops through file and extract the necessary element text to create a
    # DatabaseCompound
    compounds = []
    for metabolite_element in db:
        row = [None] * len(field_tags)
        for element in metabolite_element:
            idx = field_index.get(element.tag)
            if idx is not None:
                row[idx] = element.text

        # if all fields are present, then add them as a DatabaseCompound
        if None not in row:
            compounds.append(DatabaseCompound(*row))
    logger.info("Loaded %d DatabaseCompounds from %s" % (len(compounds), in_file))

    if delete:
        logger.info("Deleting %s" % in_file)
        os.remove(in_file)

    return compounds

extract_roi

extract_roi(file_names, out_dir, pattern, mzml_path, rp=None)

Extract ROI for all mzML files listed in file_names, and turn them into Chemical objects.

Parameters:

Name Type Description Default
file_names

a list of mzML file names

required
out_dir

output directory to store pickled chemicals. If None, each result is saved next to its mzML file, with the extension replaced by ".p"

required
pattern

pattern for output file

required
mzml_path

input directory containing all the mzML files in file_names.

required
rp

ROI building parameters, passed as roi_params to ChemicalMixtureFromMZML. If None, a default RoiBuilderParams() is used

None

Returns: a list of extracted vimms.Chemicals.Chemical, one for each mzML file

Source code in vimms/FeatureExtraction.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def extract_roi(file_names, out_dir, pattern, mzml_path, rp=None):
    """
    Extract ROI for all mzML files listed in file_names, and turn them
    into Chemical objects.

    Args:
        file_names: an iterable of mzML file names
        out_dir: output directory to store pickled chemicals. If None,
                 each result is saved next to its mzML file, using the
                 mzML file name with the extension replaced by ".p"
        pattern: %-style pattern for the output file name, only used when
                 out_dir is given. It is formatted with the integer found
                 in the third "_"-separated token of the file's base name.
        mzml_path: input directory containing all the mzML files in file_names.
                   If None, the entries of file_names are used as given.
        rp: ROI building parameters, passed as `roi_params` to
            ChemicalMixtureFromMZML. If None, a default RoiBuilderParams()
            is created and shared by all files.

    Returns: a list of extracted [vimms.Chemicals.Chemical][], one for each mzML file

    Raises:
        IndexError, ValueError: if out_dir is given and a file's base name
            has no integer as its third "_"-separated token.

    """
    # extract ROI for all mzML files in file_names
    datasets = []
    for file_name in file_names:

        # if mzml_path is provided, use that as the front part of filename
        if mzml_path is not None:
            mzml_file = os.path.join(mzml_path, file_name)
        else:
            mzml_file = file_name

        # create the default parameters lazily, once, on the first file
        if rp is None:
            rp = RoiBuilderParams()
        cm = ChemicalMixtureFromMZML(mzml_file, roi_params=rp)
        # NOTE(review): the meaning of the arguments (None, 2) is defined by
        # ChemicalMixtureFromMZML.sample, which is not visible here -- confirm
        dataset = cm.sample(None, 2)
        datasets.append(dataset)

        # save extracted chemicals
        if out_dir is None:
            # if no out_dir provided, then save in the same location
            # as the mzML file
            dataset_name = os.path.splitext(mzml_file)[0] + ".p"
            save_obj(dataset, dataset_name)
        else:
            # else save the chemicals in out_dir, using pattern as the filename
            basename = os.path.basename(file_name)
            out_name = pattern % int(basename.split("_")[2])
            save_obj(dataset, os.path.join(out_dir, out_name))

    return datasets