## Wet grinding of invertebrate bulk samples - a scalable and cost-efficient
## protocol for metabarcoding and metagenomics
## Supplementary Script 1
##
## Cleans the raw metabarcoding read table:
##   1. merges PCR replicates per sample by mean read count,
##   2. removes OTU/sample combinations detected in only one replicate,
##   3. subtracts each OTU's maximum negative-control (NC*) read count
##      from every sample and drops the NC columns,
##   4. removes OTUs with no reads left.
## Input:  'Mixer Test Read Table.xlsx'
## Output: 'Mixer Test Read Table cleaned.xlsx'

import pandas as pd
import numpy as np

## taxonomy / metadata columns that identify one OTU; used as the group key
## (defined once so the melt id_vars and the groupby key cannot drift apart)
GROUP_COLS = ['ID', 'Phylum', 'Class', 'Order', 'Family', 'Genus',
              'Species', 'Similarity', 'Status']

## read in the full dataset
full_dataset = pd.read_excel('Mixer Test Read Table.xlsx')

## drop sequ column (the OTU sequence is not needed for read filtering)
full_dataset = full_dataset.drop(['sequ'], axis = 1)

## melt the dataframe: one row per (OTU, sample) with its read count
full_dataset = full_dataset.melt(id_vars = full_dataset.columns[:9],
                                 var_name = 'sample',
                                 value_name = 'readcount')

## replace nans with empty strings as a placeholder for grouping the data
## (groupby silently drops groups whose key contains NaN)
full_dataset = full_dataset.replace(np.nan, '')

## remove replicate identifiers (the last '_'-separated part) from sample
## names before grouping, so replicates of one sample share a name
full_dataset['sample'] = full_dataset['sample'].str.split('_').str[:-1].str.join('_')

## replace 0 values with np.nan so that mean and count are calculated
## only over replicates that actually detected the OTU
full_dataset['readcount'] = full_dataset['readcount'].replace(0, np.nan)

## merge replicates: mean read number per replicate plus the replicate
## count, computed in ONE groupby with named aggregation instead of two
## separate groupby passes aligned by implicit row order (same values,
## but robust against any ordering difference and half the work)
full_dataset_merged = (full_dataset
                       .groupby(GROUP_COLS + ['sample'])['readcount']
                       .agg(readcount = 'mean', rep_count = 'count')
                       .reset_index())

## remove all OTU/sample combinations supported by exactly one replicate;
## rep_count == 0 rows are kept so negative controls stay in the table
full_dataset_merged = full_dataset_merged.loc[full_dataset_merged['rep_count'] != 1]
full_dataset_merged['readcount'] = full_dataset_merged['readcount'].replace(np.nan, 0)
full_dataset_merged = full_dataset_merged.drop(['rep_count'], axis = 1)

## pivot the table to generate a read table again (samples back to columns)
full_dataset_merged = full_dataset_merged.pivot_table(
    values = 'readcount',
    index = ['ID', 'Phylum', 'Class', 'Order', 'Family',
             'Genus', 'Species', 'Status', 'Similarity'],
    columns = 'sample').rename_axis(columns = None).reset_index()
full_dataset_merged = full_dataset_merged.replace(np.nan, 0)

## resort OTU column by the numeric index encoded at the end of the ID
full_dataset_merged['sort'] = full_dataset_merged['ID'].str.split('_').str[-1].astype('int32')
full_dataset_merged = full_dataset_merged.sort_values(by = 'sort')
full_dataset_merged = full_dataset_merged.drop('sort', axis = 1)

## collect the negative controls and the per-OTU maximum NC read count
ncs = full_dataset_merged[[col for col in full_dataset_merged.columns if col.startswith('NC')]].copy()
ncs['max'] = ncs.max(axis = 1)

## remerge the dataset: keep the first 9 (taxonomy) columns, subtract the
## NC maximum from every sample column, clip at 0 so counts never go
## negative, then drop the negative-control columns themselves
full_dataset_merged = pd.concat(
    [full_dataset_merged.iloc[:, :9],
     full_dataset_merged.iloc[:, 9:].sub(ncs['max'], axis = 0).clip(0)],
    axis = 1)
full_dataset_merged = full_dataset_merged.drop(
    [col for col in full_dataset_merged.columns if col.startswith('NC')], axis = 1)

## remove all OTUs that do not have reads left after NC subtraction
full_dataset_merged['readsum'] = full_dataset_merged.iloc[:, 9:].sum(axis = 1)
full_dataset_merged = full_dataset_merged.loc[full_dataset_merged['readsum'] > 0]
full_dataset_merged = full_dataset_merged.drop('readsum', axis = 1)

## save the cleaned dataset
full_dataset_merged.to_excel('Mixer Test Read Table cleaned.xlsx', index = False)