Source code for fastq2bcl.parser
import re
import logging
_logger = logging.getLogger(__name__)
[docs]
def parse_seqdesc_fields(txt):
    """
    Break a SeqIO description string into its named components.

    The text is matched from its first character against a pattern made
    of two colon-separated parts divided by a single whitespace
    character. The first part carries instrument, run_number,
    flowcell_id, lane and tile, optionally followed by x_pos, y_pos and
    UMI. The second part carries read, is_filtered, control_number and
    index. The pattern is not anchored at the end, so any text after the
    index is ignored.

    :param txt: description text to parse.
    :return: the result of ``validate_fields`` applied to the dict of
        captured groups (groups that did not take part in the match are
        ``None``).
    :raises ValueError: if ``txt`` does not match the pattern; errors
        raised by ``validate_fields`` propagate as well.
    """
    # One fragment per field; joined, they form exactly one pattern.
    fragments = (
        r"(?P<instrument>[A-Za-z0-9_]+):",
        r"(?P<run_number>[0-9]+):",
        r"(?P<flowcell_id>[A-Za-z0-9-]+):",
        r"(?P<lane>[0-9]+):",
        r"(?P<tile>[0-9]+):?",
        r"(?P<x_pos>[0-9]+)?:?",
        r"(?P<y_pos>[0-9]+)?:?",
        r"(?P<UMI>[A-Z-]+)?",
        r"\s",
        r"(?P<read>[0-9]+):",
        r"(?P<is_filtered>[YN]+):",
        r"(?P<control_number>[0-9]+):",
        r"(?P<index>[0-9A-Z+]+)",
    )
    header_re = re.compile("".join(fragments))

    found = header_re.match(txt)
    if found is None:
        raise ValueError(f"Sequence identifier not recognized: {txt}")

    captured = found.groupdict()
    return validate_fields(captured)
[docs]
def validate_fields(fields):
    """
    Validate the fields extracted from a SeqIO description.

    Each expected key is looked up in ``fields`` and must hold a truthy
    value. The single exception is ``UMI``, which is optional and may be
    ``None``.

    :param fields: mapping from field name to extracted value, for
        example the ``groupdict()`` of a regular-expression match.
    :return: the same ``fields`` object, unmodified.
    :raises ValueError: if a key holds a falsy value, unless that key is
        ``UMI`` and the value is ``None``.

    A key that is absent from ``fields`` is not handled here: the lookup
    error (``KeyError`` for a plain dict) propagates to the caller.
    """
    expected_keys = (
        "instrument",
        "run_number",
        "flowcell_id",
        "lane",
        "tile",
        "x_pos",
        "y_pos",
        "UMI",
        "read",
        "is_filtered",
        "control_number",
        "index",
    )
    _logger.debug("Verifying keys ...")
    for key in expected_keys:
        value = fields[key]
        if value:
            continue
        # UMI is the only optional field. Use an identity check so that
        # only a genuine None is accepted, not an object that merely
        # compares equal to None.
        if key == "UMI" and value is None:
            _logger.debug("Found None value for optional key %s. This is ok.", key)
            continue
        raise ValueError(f"Requested Key {key} not Found in fastq description")
    return fields