## Wet grinding of invertebrate bulk samples - a scalable and cost-efficient
## protocol for metabarcoding and metagenomics
## Supplementary Script 1
##
## Cleans the raw metabarcoding read table:
##   1. merges PCR replicates per sample by mean read count,
##   2. removes OTU/sample combinations detected in only one replicate,
##   3. subtracts each OTU's maximum negative-control (NC*) read count
##      from every sample and drops the NC columns,
##   4. removes OTUs with no reads left.
## Input:  'Mixer Test Read Table.xlsx'
## Output: 'Mixer Test Read Table cleaned.xlsx'

import pandas as pd
import numpy as np

## taxonomy / metadata columns that identify one OTU; used as the group key
## (defined once so the melt id_vars and the groupby key cannot drift apart)
GROUP_COLS = ['ID', 'Phylum', 'Class', 'Order', 'Family', 'Genus',
              'Species', 'Similarity', 'Status']

## read in the full dataset
full_dataset = pd.read_excel('Mixer Test Read Table.xlsx')

## drop sequ column (the OTU sequence is not needed for read filtering)
full_dataset = full_dataset.drop(['sequ'], axis = 1)

## melt the dataframe: one row per (OTU, sample) with its read count
full_dataset = full_dataset.melt(id_vars = full_dataset.columns[:9],
                                 var_name = 'sample',
                                 value_name = 'readcount')

## replace nans with empty strings as a placeholder for grouping the data
## (groupby silently drops groups whose key contains NaN)
full_dataset = full_dataset.replace(np.nan, '')

## remove replicate identifiers (the last '_'-separated part) from sample
## names before grouping, so replicates of one sample share a name
full_dataset['sample'] = full_dataset['sample'].str.split('_').str[:-1].str.join('_')

## replace 0 values with np.nan so that mean and count are calculated
## only over replicates that actually detected the OTU
full_dataset['readcount'] = full_dataset['readcount'].replace(0, np.nan)

## merge replicates: mean read number per replicate plus the replicate
## count, computed in ONE groupby with named aggregation instead of two
## separate groupby passes aligned by implicit row order (same values,
## but robust against any ordering difference and half the work)
full_dataset_merged = (full_dataset
                       .groupby(GROUP_COLS + ['sample'])['readcount']
                       .agg(readcount = 'mean', rep_count = 'count')
                       .reset_index())

## remove all OTU/sample combinations supported by exactly one replicate;
## rep_count == 0 rows are kept so negative controls stay in the table
full_dataset_merged = full_dataset_merged.loc[full_dataset_merged['rep_count'] != 1]
full_dataset_merged['readcount'] = full_dataset_merged['readcount'].replace(np.nan, 0)
full_dataset_merged = full_dataset_merged.drop(['rep_count'], axis = 1)

## pivot the table to generate a read table again (samples back to columns)
full_dataset_merged = full_dataset_merged.pivot_table(
    values = 'readcount',
    index = ['ID', 'Phylum', 'Class', 'Order', 'Family',
             'Genus', 'Species', 'Status', 'Similarity'],
    columns = 'sample').rename_axis(columns = None).reset_index()
full_dataset_merged = full_dataset_merged.replace(np.nan, 0)

## resort OTU column by the numeric index encoded at the end of the ID
full_dataset_merged['sort'] = full_dataset_merged['ID'].str.split('_').str[-1].astype('int32')
full_dataset_merged = full_dataset_merged.sort_values(by = 'sort')
full_dataset_merged = full_dataset_merged.drop('sort', axis = 1)

## collect the negative controls and the per-OTU maximum NC read count
ncs = full_dataset_merged[[col for col in full_dataset_merged.columns if col.startswith('NC')]].copy()
ncs['max'] = ncs.max(axis = 1)

## remerge the dataset: keep the first 9 (taxonomy) columns, subtract the
## NC maximum from every sample column, clip at 0 so counts never go
## negative, then drop the negative-control columns themselves
full_dataset_merged = pd.concat(
    [full_dataset_merged.iloc[:, :9],
     full_dataset_merged.iloc[:, 9:].sub(ncs['max'], axis = 0).clip(0)],
    axis = 1)
full_dataset_merged = full_dataset_merged.drop(
    [col for col in full_dataset_merged.columns if col.startswith('NC')], axis = 1)

## remove all OTUs that do not have reads left after NC subtraction
full_dataset_merged['readsum'] = full_dataset_merged.iloc[:, 9:].sum(axis = 1)
full_dataset_merged = full_dataset_merged.loc[full_dataset_merged['readsum'] > 0]
full_dataset_merged = full_dataset_merged.drop('readsum', axis = 1)

## save the cleaned dataset
full_dataset_merged.to_excel('Mixer Test Read Table cleaned.xlsx', index = False)