from utils import *
import sys
import csv


class Trigrams:
    """Vocabulary that maps sequence elements to indices and counts them.

    ``index2trigram`` starts with the two markers ``0: "SOS"`` and
    ``1: "EOS"``; elements registered through ``addTrigram`` are numbered
    from 2 upward. The markers are not entered into ``trigram2index`` or
    ``trigram2count``.

    The constructor also reads a reference genome from a CSV file and keeps
    one slice of it in ``refseq``.
    """

    # CSV file whose first row holds the complete reference genome.
    DEFAULT_REF_GENOME_PATH = '/raid2/darrak/FULLgenome/SARS-CoV-2_NucleotideData/ref_genome.csv'
    # (start, stop) bounds of the genome slice kept in ``refseq``; this is the
    # slice the original code assigned to a variable named NSP9.
    DEFAULT_REGION = (12685, 13024)

    def __init__(self, name, ref_genome_path=DEFAULT_REF_GENOME_PATH,
                 region=DEFAULT_REGION):
        """Create an empty vocabulary and load the reference sequence.

        Args:
            name: Label stored on the instance as ``self.name``.
            ref_genome_path: CSV file to read the reference genome from.
            region: ``(start, stop)`` pair used as ``genome[start:stop]``.

        Raises:
            OSError: If ``ref_genome_path`` cannot be opened.
            ValueError: If the CSV file has no first row or that row is empty.
        """
        self.name = name
        self.trigram2index = {}
        self.index2trigram = {0: "SOS", 1: "EOS"}
        self.trigram2count = {}
        self.num_trigrams = 2  # indices 0 and 1 are taken by SOS and EOS

        # newline='' is what the csv module asks for when handing it a file.
        with open(ref_genome_path, 'r', newline='') as read_obj:
            row = next(csv.reader(read_obj), None)
        if not row:
            raise ValueError(
                "no reference genome found in first row of %r" % (ref_genome_path,)
            )

        # The previous implementation sliced every cell of the row but then
        # stored the placeholder string "blank" in refseq, discarding the
        # slice. Keep the slice of the last cell, which is the value the old
        # loop ended on.
        start, stop = region
        self.refseq = row[-1][start:stop]

    def addSequence(self, seq):
        """Register every element of ``seq`` through ``addTrigram``."""
        for trigram in seq:
            self.addTrigram(trigram)

    def addTrigram(self, trigram):
        """Count ``trigram``, giving it the next free index on first sight."""
        if trigram not in self.trigram2index:
            index = self.num_trigrams
            self.trigram2index[trigram] = index
            self.index2trigram[index] = trigram
            self.num_trigrams = index + 1
        self.trigram2count[trigram] = self.trigram2count.get(trigram, 0) + 1

def prepareData(lang1, lang2, reverse = False):
    """Load the sequence pairs and fill both vocabularies from them.

    Args:
        lang1: Name given to the input-side vocabulary.
        lang2: Name given to the output-side vocabulary.
        reverse: Forwarded unchanged to ``readTrigrams``.

    Returns:
        The tuple ``(input_lang, output_lang, pairs)`` with both
        vocabularies populated.
    """
    input_lang, output_lang, pairs = readTrigrams(lang1, lang2, reverse)
    print("Read sequence pairs : ", len(pairs))

    # No filtering of pairs is applied at this point (filterPairs is disabled).
    print("Counting words")

    # The first and the last pair are left out of the vocabulary counts.
    # NOTE(review): presumably a header line and the empty entry produced by
    # a trailing newline -- confirm against the data file.
    for pair in pairs[1:-1]:
        input_lang.addSequence(pair[0])
        output_lang.addSequence(pair[1])

    print("Trigrams : ")
    summary = (("L1 Trigrams: ", input_lang), ("L2 Trigrams: ", output_lang))
    for label, vocab in summary:
        print(label, vocab.name, vocab.num_trigrams)
    print(output_lang.index2trigram)

    return input_lang, output_lang, pairs



def readTrigrams(lang1, lang2, reverse = False,
                 data_path = '/raid2/darrak/FULLgenome/NSP9/Months/PairedTextFiles/Final_w_io.txt'):
    """Read the tab-separated pair file and create two empty vocabularies.

    Each line of the file is split on tabs and every field is passed through
    ``normalizeString2``.

    Args:
        lang1: Name for the vocabulary returned first.
        lang2: Name for the vocabulary returned second.
        reverse: When true, the fields of every pair are put in reverse order.
        data_path: Text file to read; defaults to the original hard-coded file.

    Returns:
        The tuple ``(input_lang, output_lang, pairs)`` where ``pairs`` holds
        one list of normalized fields per line of the file.

    Raises:
        OSError: If ``data_path`` cannot be opened.
    """
    print("Reading Files . . . . ")

    # Use a context manager so the file handle is closed deterministically.
    # split('\n') is kept on purpose (rather than splitlines()): a trailing
    # newline yields a final empty entry, and the number of entries is
    # visible to callers through len(pairs).
    with open(data_path, encoding = 'utf-8') as handle:
        lines = handle.read().split('\n')

    # Split every line into its tab-separated, normalized fields.
    pairs = [[normalizeString2(s) for s in l.split('\t')] for l in lines]

    # Optionally swap the direction of every pair.
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]

    # NOTE(review): the vocabularies keep the lang1/lang2 order even when the
    # pairs are reversed, exactly as before -- confirm this is intended.
    input_lang = Trigrams(lang1)
    output_lang = Trigrams(lang2)

    return input_lang, output_lang, pairs
    