/* $Id: BioSource.cpp 697272 2025-05-08 11:59:16Z ivanov $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Author:  .......
 *
 * File Description:
 *   .......
 *
 * Remark:
 *   This code was originally generated by application DATATOOL
 *   using specifications from the data definition file
 *   'seqfeat.asn'.
 */

// standard includes

// generated includes
#include <ncbi_pch.hpp>
#include <objects/seqfeat/BioSource.hpp>
#include <objects/seqfeat/Org_ref.hpp>
#include <objects/seqfeat/OrgName.hpp>
#include <objects/seqfeat/OrgMod.hpp>
#include <objects/seqfeat/SubSource.hpp>
#include <objects/seqfeat/PCRReactionSet.hpp>
#include <algorithm>
#include <set>
#include <util/static_map.hpp>
#include <corelib/ncbistr.hpp>
#include <corelib/ncbistre.hpp>
#include <corelib/ncbienv.hpp>
#include <corelib/ncbiobj.hpp>
#include <corelib/ncbi_limits.h>
#include <memory>
#include <set>
#include <list>
#include <vector>


// generated classes

BEGIN_NCBI_SCOPE

BEGIN_objects_SCOPE // namespace ncbi::objects::

// Destructor: this class performs no explicit cleanup of its own here.
CBioSource::~CBioSource()
{
    // intentionally empty
}


int CBioSource::GetGenCode(int def) const
{
    try {
        TGenome genome = CanGetGenome() ? GetGenome() : eGenome_unknown;

        if ( !CanGetOrg()  ||  !GetOrg().CanGetOrgname() ) {
            return def; // assume standard genetic code
        }
        const COrgName& orn = GetOrg().GetOrgname();

        switch ( genome ) {
        case eGenome_kinetoplast:
        case eGenome_mitochondrion:
        case eGenome_hydrogenosome:
        case eGenome_plasmid_in_mitochondrion:
            {
                // mitochondrial code
                if (orn.IsSetMgcode()) {
                    return orn.GetMgcode();
                }
                return def;
            }
        case eGenome_chloroplast:
        case eGenome_chromoplast:
        case eGenome_plastid:
        case eGenome_cyanelle:
        case eGenome_apicoplast:
        case eGenome_leucoplast:
        case eGenome_proplastid:
        case eGenome_chromatophore:
        case eGenome_plasmid_in_plastid:
            {
                // bacteria and plant plastid code
                if (orn.IsSetPgcode()) {
                    int pgcode = orn.GetPgcode();
                    if (pgcode > 0) return pgcode;
                }
                // bacteria and plant plastids default to code 11.
                return 11;
            }
        default:
            {
                if (orn.IsSetGcode()) {
                    return orn.GetGcode();
                }
                return def;
            }
        }
    } catch (...) {
        return def; // was 0(!)
    }
}

// One (organelle text, genome enum value) entry of the lookup table below.
typedef SStaticPair<const char *, CBioSource::EGenome> TGenomeKey;

// Organelle names and the genome value each one maps to.
// Several spellings may map to the same value (for example "extrachrom" and
// "extrachromosomal", or the "plastid:xxx" forms), so the reverse lookup in
// GetOrganelleByGenome() returns whichever spelling appears first here.
// The entries are listed in ascending case-insensitive order of the key;
// keep them that way when adding rows (the static map built from this array
// is expected to require sorted input -- see util/static_map.hpp).
static const TGenomeKey genome_key_to_subtype [] = {
    {  "apicoplast",                CBioSource::eGenome_apicoplast        },
    {  "chloroplast",               CBioSource::eGenome_chloroplast       },
    {  "chromatophore",             CBioSource::eGenome_chromatophore     },
    {  "chromoplast",               CBioSource::eGenome_chromoplast       },
    {  "chromosome",                CBioSource::eGenome_chromosome        },
    {  "cyanelle",                  CBioSource::eGenome_cyanelle          },
    {  "endogenous virus",          CBioSource::eGenome_endogenous_virus  },
    {  "endogenous_virus",          CBioSource::eGenome_endogenous_virus  },
    {  "extrachrom",                CBioSource::eGenome_extrachrom        },
    {  "extrachromosomal",          CBioSource::eGenome_extrachrom        },
    {  "genomic",                   CBioSource::eGenome_genomic           },
    {  "hydrogenosome",             CBioSource::eGenome_hydrogenosome     },
    {  "insertion_seq",             CBioSource::eGenome_insertion_seq     },
    {  "kinetoplast",               CBioSource::eGenome_kinetoplast       },
    {  "leucoplast",                CBioSource::eGenome_leucoplast        },
    {  "macronuclear",              CBioSource::eGenome_macronuclear      },
    {  "mitochondrion",             CBioSource::eGenome_mitochondrion     },
    {  "mitochondrion:kinetoplast", CBioSource::eGenome_kinetoplast       },
    {  "nucleomorph",               CBioSource::eGenome_nucleomorph       },
    {  "plasmid",                   CBioSource::eGenome_plasmid           },
    {  "plastid",                   CBioSource::eGenome_plastid           },
    {  "plastid:apicoplast",        CBioSource::eGenome_apicoplast        },
    {  "plastid:chloroplast",       CBioSource::eGenome_chloroplast       },
    {  "plastid:chromatophore",     CBioSource::eGenome_chromatophore     },
    {  "plastid:chromoplast",       CBioSource::eGenome_chromoplast       },
    {  "plastid:cyanelle",          CBioSource::eGenome_cyanelle          },
    {  "plastid:leucoplast",        CBioSource::eGenome_leucoplast        },
    {  "plastid:proplastid",        CBioSource::eGenome_proplastid        },
    {  "proplastid",                CBioSource::eGenome_proplastid        },
    {  "proviral",                  CBioSource::eGenome_proviral          },
    {  "transposon",                CBioSource::eGenome_transposon        },
    {  "unknown",                   CBioSource::eGenome_unknown           },
    {  "virion",                    CBioSource::eGenome_virion            }
};


// Static map over the table above, keyed with the case-insensitive C-string
// comparator PNocase_CStr.
typedef CStaticPairArrayMap <const char*, CBioSource::EGenome, PNocase_CStr> TGenomeMap;
DEFINE_STATIC_ARRAY_MAP(TGenomeMap, sm_GenomeKeys, genome_key_to_subtype);

// Translate an organelle description into a genome value.
//
// @param organelle    text to look up.
// @param use_case     case sensitivity passed to the NStr comparisons.
// @param starts_with  when true, 'organelle' only has to begin with a known
//                     key; a table key must then be followed by the end of
//                     the string or by whitespace.  The extra word
//                     "mitochondrial" is accepted as a bare prefix.
// @return             the matching genome value, or eGenome_unknown.
CBioSource::EGenome CBioSource::GetGenomeByOrganelle (const string& organelle, NStr::ECase use_case, bool starts_with)
{
    // Fast path: whole-string lookup straight in the static map.
    // NOTE(review): sm_GenomeKeys is declared with PNocase_CStr, so this
    // lookup appears to be case-insensitive even though use_case is eCase;
    // only the "mitochondrial" fallback below is compared case-sensitively.
    // Confirm this is intended.
    if (use_case == NStr::eCase && !starts_with) {
        const TGenomeMap::const_iterator hit = sm_GenomeKeys.find (organelle.c_str ());
        if (hit != sm_GenomeKeys.end()) {
            return hit->second;
        }
        if (NStr::Equal(organelle, "mitochondrial")) {
            return CBioSource::eGenome_mitochondrion;
        }
        return CBioSource::eGenome_unknown;
    }

    if (starts_with) {
        // "mitochondrial" is checked first and needs no word boundary.
        if (NStr::StartsWith(organelle, "mitochondrial", use_case)) {
            return CBioSource::eGenome_mitochondrion;
        }
        for (TGenomeMap::const_iterator entry = sm_GenomeKeys.begin();
             entry != sm_GenomeKeys.end();  ++entry) {
            const string key = entry->first;
            if (!NStr::StartsWith(organelle, key.c_str(), use_case)) {
                continue;
            }
            // Accept the key only when it is the whole string or is
            // immediately followed by whitespace.
            if (organelle.length() == key.length()
                || (key.length() < organelle.length() && isspace(organelle[key.length()]))) {
                // A hit on the "unknown" entry does not stop the scan.
                if (entry->second != CBioSource::eGenome_unknown) {
                    return entry->second;
                }
            }
        }
        return CBioSource::eGenome_unknown;
    }

    // Whole-string comparison honouring use_case.
    if (NStr::Equal(organelle, "mitochondrial", use_case)) {
        return CBioSource::eGenome_mitochondrion;
    }
    for (TGenomeMap::const_iterator entry = sm_GenomeKeys.begin();
         entry != sm_GenomeKeys.end();  ++entry) {
        if (NStr::Equal(organelle, entry->first, use_case)
            && entry->second != CBioSource::eGenome_unknown) {
            return entry->second;
        }
    }
    return CBioSource::eGenome_unknown;
}


// Reverse lookup: return the first key in sm_GenomeKeys whose value equals
// 'genome', or an empty string when no entry has that value.
string CBioSource::GetOrganelleByGenome (unsigned int genome)
{
    for (TGenomeMap::const_iterator entry = sm_GenomeKeys.begin();
         entry != sm_GenomeKeys.end();  ++entry) {
        if (unsigned(entry->second) == genome) {
            return entry->first;
        }
    }
    return kEmptyStr;
}


// One (origin text, origin enum value) entry of the lookup tables below.
typedef SStaticPair<const char *, CBioSource::EOrigin> TOriginKey;

// Primary origin names.  GetStringFromOrigin() searches only this table, so
// these are the spellings produced when converting a value back to text.
// Entries are listed in ascending case-insensitive key order; keep them
// sorted when adding rows (the static map built from this array is expected
// to require sorted input -- see util/static_map.hpp).
static const TOriginKey origin_key_to_subtype [] = {
    {  "artificial",                CBioSource::eOrigin_artificial    },
    {  "mutant",                    CBioSource::eOrigin_mut           },
    {  "natural",                   CBioSource::eOrigin_natural       },
    {  "natural mutant",            CBioSource::eOrigin_natmut        },
    {  "other",                     CBioSource::eOrigin_other         },
    {  "synthetic",                 CBioSource::eOrigin_synthetic     },
    {  "unknown",                   CBioSource::eOrigin_unknown       }
};

// Alternative spellings accepted by GetOriginByString() when the primary
// table has no match.  Same ordering rule as above.
static const TOriginKey origin_synonyms [] = {
    {  "mut",                       CBioSource::eOrigin_mut           },
    {  "nat mut",                   CBioSource::eOrigin_natmut        },
    {  "natmut",                    CBioSource::eOrigin_natmut        }
};


// Static maps over the two tables, keyed with the case-insensitive C-string
// comparator PNocase_CStr.
typedef CStaticPairArrayMap <const char*, CBioSource::EOrigin, PNocase_CStr> TOriginMap;
DEFINE_STATIC_ARRAY_MAP(TOriginMap, sm_OriginKeys, origin_key_to_subtype);
DEFINE_STATIC_ARRAY_MAP(TOriginMap, sm_OriginSynonyms, origin_synonyms);

CBioSource::EOrigin CBioSource::GetOriginByString (const string& origin, NStr::ECase use_case, bool starts_with)
{
    CBioSource::EOrigin gtype = CBioSource::eOrigin_unknown;

    if (use_case == NStr::eCase && !starts_with) {
        TOriginMap::const_iterator g_iter = sm_OriginKeys.find (origin.c_str ());
        if (g_iter == sm_OriginKeys.end ()) {
            g_iter = sm_OriginSynonyms.find (origin.c_str());
            if (g_iter != sm_OriginSynonyms.end ()) {
                gtype = g_iter->second;
            }
        } else {
            gtype = g_iter->second;
        }
    } else {
        TOriginMap::const_iterator g_iter = sm_OriginKeys.begin();
        bool found = false;
        if (starts_with) {
            string match;
            while (g_iter != sm_OriginKeys.end() && !found) {
                match = g_iter->first;
                if (NStr::StartsWith(origin, match.c_str(), use_case)) {
                    if (origin.length() == match.length()
                        || (match.length() < origin.length() && isspace (origin[match.length()]))) {
                        gtype = g_iter->second;
                        found = true;
                    }
                }
                ++g_iter;
            }
            if (!found) {
                g_iter = sm_OriginSynonyms.begin();
                while (g_iter != sm_OriginSynonyms.end() && !found) {
                    match = g_iter->first;
                    if (NStr::StartsWith(origin, match.c_str(), use_case)) {
                        if (origin.length() == match.length()
                            || (match.length() < origin.length() && isspace (origin[match.length()]))) {
                            gtype = g_iter->second;
                            found = true;
                        }
                    }
                    ++g_iter;
                }
            }
        } else {
            while (g_iter != sm_OriginKeys.end() && !found) {
                if (NStr::Equal(origin, g_iter->first, use_case)) {
                    gtype = g_iter->second;
                }
                ++g_iter;
            }
            if (!found) {
                g_iter = sm_OriginSynonyms.begin();
                while (g_iter != sm_OriginSynonyms.end() && !found) {
                    if (NStr::Equal(origin, g_iter->first, use_case)) {
                        gtype = g_iter->second;
                    }
                    ++g_iter;
                }
            }
        }
    }
    return gtype;
}


// Reverse lookup: return the first primary name in sm_OriginKeys whose value
// equals 'origin', or an empty string when there is none.  Synonyms are not
// consulted.
string CBioSource::GetStringFromOrigin (unsigned int origin)
{
    for (TOriginMap::const_iterator entry = sm_OriginKeys.begin();
         entry != sm_OriginKeys.end();  ++entry) {
        if (unsigned(entry->second) == origin) {
            return entry->first;
        }
    }
    return "";
}


// True when an Org-ref is present and it reports a taxname.
bool CBioSource::IsSetTaxname(void) const
{
    if ( !IsSetOrg() ) {
        return false;
    }
    return GetOrg().IsSetTaxname();
}

// Forward to the Org-ref's taxname.  No presence check is made here; see
// IsSetTaxname().
const string& CBioSource::GetTaxname(void) const
{
    const COrg_ref& org = GetOrg();
    return org.GetTaxname();
}

// True when an Org-ref is present and it reports a common name.
bool CBioSource::IsSetCommon(void) const
{
    if ( !IsSetOrg() ) {
        return false;
    }
    return GetOrg().IsSetCommon();
}

// Forward to the Org-ref's common name.  No presence check is made here;
// see IsSetCommon().
const string& CBioSource::GetCommon(void) const
{
    const COrg_ref& org = GetOrg();
    return org.GetCommon();
}

// True when an Org-ref is present and it reports a lineage.
bool CBioSource::IsSetLineage(void) const
{
    if ( !IsSetOrg() ) {
        return false;
    }
    return GetOrg().IsSetLineage();
}

// Forward to the Org-ref's lineage.  No presence check is made here; see
// IsSetLineage().
const string& CBioSource::GetLineage(void) const
{
    const COrg_ref& org = GetOrg();
    return org.GetLineage();
}

// True when an Org-ref is present and it reports a genetic code.
bool CBioSource::IsSetGcode(void) const
{
    if ( !IsSetOrg() ) {
        return false;
    }
    return GetOrg().IsSetGcode();
}

int CBioSource::GetGcode(void) const
{
    return GetOrg ().GetGcode ();
}

// True when an Org-ref is present and it reports a value via IsSetMgcode().
bool CBioSource::IsSetMgcode(void) const
{
    if ( !IsSetOrg() ) {
        return false;
    }
    return GetOrg().IsSetMgcode();
}

int CBioSource::GetMgcode(void) const
{
    return GetOrg ().GetMgcode ();
}

// True when an Org-ref is present and it reports a value via IsSetPgcode().
bool CBioSource::IsSetPgcode(void) const
{
    if ( !IsSetOrg() ) {
        return false;
    }
    return GetOrg().IsSetPgcode();
}

int CBioSource::GetPgcode(void) const
{
    return GetOrg ().GetPgcode ();
}

// True when an Org-ref is present and it reports a division.
bool CBioSource::IsSetDivision(void) const
{
    if ( !IsSetOrg() ) {
        return false;
    }
    return GetOrg().IsSetDivision();
}

// Forward to the Org-ref's division.  No presence check is made here; see
// IsSetDivision().
const string& CBioSource::GetDivision(void) const
{
    const COrg_ref& org = GetOrg();
    return org.GetDivision();
}

// True when an Org-ref is present and it carries an OrgName.
bool CBioSource::IsSetOrgname(void) const
{
    if ( !IsSetOrg() ) {
        return false;
    }
    return GetOrg().IsSetOrgname();
}

// Forward to the Org-ref's OrgName.  No presence check is made here; see
// IsSetOrgname().
const COrgName& CBioSource::GetOrgname(void) const
{
    const COrg_ref& org = GetOrg();
    return org.GetOrgname();
}

// True when an Org-ref is present and it reports org modifiers.
bool CBioSource::IsSetOrgMod(void) const
{
    if ( !IsSetOrg() ) {
        return false;
    }
    return GetOrg().IsSetOrgMod();
}


// Work out a replicon name for this source.
//
// The subsources are scanned in order and the first usable name wins:
//   - plasmid-name, chromosome, plastid-name, endogenous-virus-name: always;
//   - linkage-group: only when the genome is eGenome_chromosome;
//   - segment: only when GetBioprojectType() is "eSegment".
// If none applies, a fixed placeholder is chosen from the genome value, and
// an empty string is returned when even that is not available.
string CBioSource::GetRepliconName(void) const
{
    const string project_type = GetBioprojectType();

    ITERATE (CBioSource::TSubtype, sub_it, GetSubtype()) {
        const CSubSource& sub = **sub_it;
        if ( !sub.IsSetSubtype()  ||  !sub.IsSetName() ) {
            continue;
        }
        const CSubSource_Base::TSubtype kind = sub.GetSubtype();
        const string& label = sub.GetName();

        if (kind == CSubSource::eSubtype_plasmid_name
            || kind == CSubSource::eSubtype_chromosome
            || kind == CSubSource::eSubtype_plastid_name
            || kind == CSubSource::eSubtype_endogenous_virus_name) {
            return label;
        }
        if (kind == CSubSource::eSubtype_linkage_group) {
            if (IsSetGenome() && GetGenome() == CBioSource::eGenome_chromosome) {
                return label;
            }
        } else if (kind == CSubSource::eSubtype_segment) {
            if (project_type == "eSegment") {
                return label;
            }
        }
    }

    // No subsource supplied a name: fall back to a per-genome placeholder.
    if ( !IsSetGenome() ) {
        return kEmptyStr;
    }
    switch (GetGenome()) {
        case CBioSource::eGenome_plasmid:
        case CBioSource::eGenome_plasmid_in_mitochondrion:
        case CBioSource::eGenome_plasmid_in_plastid:
            return "unnamed";

        case CBioSource::eGenome_chromosome:
            return "ANONYMOUS";

        case CBioSource::eGenome_kinetoplast:
            return "kinetoplast";

        case CBioSource::eGenome_plastid:
        case CBioSource::eGenome_chloroplast:
        case CBioSource::eGenome_chromoplast:
        case CBioSource::eGenome_apicoplast:
        case CBioSource::eGenome_leucoplast:
        case CBioSource::eGenome_proplastid:
        case CBioSource::eGenome_chromatophore:
            return "Pltd";

        case CBioSource::eGenome_mitochondrion:
        case CBioSource::eGenome_hydrogenosome:
            return "MT";

        default:
            break;
    }
    return kEmptyStr;
}


// Classify this source as one of the strings "ePlasmid", "eExtrachrom",
// "eLinkageGroup", "eSegment" or "eChromosome".
//
// Checks are made in this order, first hit wins:
//   1. genome is one of the plasmid values        -> "ePlasmid"
//   2. genome is eGenome_extrachrom               -> "eExtrachrom"
//   3. any subsource has subtype plasmid-name     -> "ePlasmid"
//   4. genome is eGenome_chromosome and any
//      subsource has subtype linkage-group        -> "eLinkageGroup"
//   5. lineage contains "viruses" or "viroids"
//      (case-insensitive)                         -> "eSegment"
//   6. otherwise                                  -> "eChromosome"
string CBioSource::GetBioprojectType (void) const
{
    const bool has_genome = IsSetGenome();

    if (has_genome) {
        const TGenome location = GetGenome();
        if (location == CBioSource::eGenome_plasmid
            || location == CBioSource::eGenome_plasmid_in_mitochondrion
            || location == CBioSource::eGenome_plasmid_in_plastid) {
            return "ePlasmid";
        }
        if (location == CBioSource::eGenome_extrachrom) {
            return "eExtrachrom";
        }
    }

    // One pass over the subsources: a plasmid-name anywhere wins outright,
    // a linkage-group is only remembered for the chromosome check below.
    bool saw_linkage_group = false;
    ITERATE (CBioSource::TSubtype, sub_it, GetSubtype()) {
        if ( !(*sub_it)->IsSetSubtype() ) {
            continue;
        }
        const CSubSource::TSubtype kind = (*sub_it)->GetSubtype();
        if (kind == CSubSource::eSubtype_plasmid_name) {
            return "ePlasmid";
        }
        if (kind == CSubSource::eSubtype_linkage_group) {
            saw_linkage_group = true;
        }
    }

    if (has_genome
        && GetGenome() == CBioSource::eGenome_chromosome
        && saw_linkage_group) {
        return "eLinkageGroup";
    }

    if (IsSetOrg() && GetOrg().IsSetLineage()) {
        const string& taxonomy = GetOrg().GetLineage();
        const bool is_virus  = NStr::FindNoCase(taxonomy, "viruses") != string::npos;
        if (is_virus || NStr::FindNoCase(taxonomy, "viroids") != string::npos) {
            return "eSegment";
        }
    }
    return "eChromosome";
}


// Decide between "eVirionPhage", "eViroid" and "eOther" from the lineage of
// 'src': "viruses" is checked before "viroids" (both case-insensitive), and
// "eOther" is returned when neither is present or no lineage is available.
static string s_VirusOrViroidLocation(const CBioSource& src)
{
    if (src.IsSetOrg() && src.GetOrg().IsSetLineage()) {
        const string& lineage = src.GetOrg().GetLineage();
        if (NStr::FindNoCase(lineage, "viruses") != string::npos) {
            return "eVirionPhage";
        }
        if (NStr::FindNoCase(lineage, "viroids") != string::npos) {
            return "eViroid";
        }
    }
    return "eOther";
}


// Return the location string for this source.
//
// Order of evaluation (first hit wins):
//   1. genome is eGenome_chromosome                 -> "eNuclearProkaryote"
//   2. GetBioprojectType() is "eSegment"            -> virus/viroid/other,
//                                                      decided by lineage
//   3. genome is not set                            -> "eNuclearProkaryote"
//   4. genome has a dedicated location string       -> that string
//   5. anything else                                -> "eNuclearProkaryote"
//
// The eGenome_chromosome case inside the switch and the trailing
// "eSegment"/"eOther" fallback were unreachable (both conditions are fully
// handled by steps 1 and 2) and have been removed; results are unchanged.
string CBioSource::GetBioprojectLocation(void) const
{
    if (IsSetGenome() && GetGenome() == CBioSource::eGenome_chromosome) {
        return "eNuclearProkaryote";
    }

    const string bioprojecttype = GetBioprojectType();
    if (NStr::Equal(bioprojecttype, "eSegment")) {
        return s_VirusOrViroidLocation(*this);
    }

    if (!IsSetGenome()) {
        return "eNuclearProkaryote";
    }

    switch (GetGenome()) {
        case CBioSource::eGenome_mitochondrion:
            return "eMitochondrion";
        case CBioSource::eGenome_kinetoplast:
            return "eKinetoplast";
        case CBioSource::eGenome_chloroplast:
            return "eChloroplast";
        case CBioSource::eGenome_chromoplast:
            return "eChromoplast";
        case CBioSource::eGenome_plastid:
            return "ePlastid";
        case CBioSource::eGenome_macronuclear:
            return "eMacronuclear";
        case CBioSource::eGenome_cyanelle:
            return "eCyanelle";
        case CBioSource::eGenome_proviral:
        case CBioSource::eGenome_endogenous_virus:
            return "eProviralProphage";
        case CBioSource::eGenome_virion:
            return s_VirusOrViroidLocation(*this);
        case CBioSource::eGenome_nucleomorph:
            return "eNucleomorph";
        case CBioSource::eGenome_apicoplast:
            return "eApicoplast";
        case CBioSource::eGenome_leucoplast:
            return "eLeucoplast";
        case CBioSource::eGenome_proplastid:
            return "eProplastid";
        case CBioSource::eGenome_hydrogenosome:
            return "eHydrogenosome";
        case CBioSource::eGenome_chromatophore:
            return "eChromatophore";
        case CBioSource::eGenome_transposon:
        case CBioSource::eGenome_insertion_seq:
            return "eOther";
        default:
            // eGenome_unknown, eGenome_genomic, eGenome_plasmid,
            // eGenome_extrachrom and every value without a dedicated string.
            break;
    }
    return "eNuclearProkaryote";
}

// Marker text stored in / searched for in OrgName.attrib by
// SetDisableStrainForwarding() and GetDisableStrainForwarding().
static const char* kDisableStrainForwardAttrib = "nomodforward";

void CBioSource::SetDisableStrainForwarding(bool val)
{
    if (val) {
        string attrib = kEmptyStr;
        if (IsSetOrg() && GetOrg().IsSetOrgname() && GetOrg().GetOrgname().IsSetAttrib()) {
            attrib = GetOrg().GetOrgname().GetAttrib();
        }
        if (NStr::Find(attrib, kDisableStrainForwardAttrib) == string::npos) {
            if (!NStr::IsBlank(attrib)) {
                attrib += ";";
            }
            attrib += kDisableStrainForwardAttrib;
            SetOrg().SetOrgname().SetAttrib(attrib);
        }
    } else {
        if (IsSetOrg() && GetOrg().IsSetOrgname() && GetOrg().GetOrgname().IsSetAttrib()) {
            NStr::ReplaceInPlace(SetOrg().SetOrgname().SetAttrib(), kDisableStrainForwardAttrib, "");
            NStr::ReplaceInPlace(SetOrg().SetOrgname().SetAttrib(), ";;", "");
            if (NStr::IsBlank(GetOrg().GetOrgname().GetAttrib())) {
                SetOrg().SetOrgname().ResetAttrib();
            }
        }
    }
}


bool CBioSource::GetDisableStrainForwarding() const
{
    bool val = false;
    if (IsSetOrg() && GetOrg().IsSetOrgname() && GetOrg().GetOrgname().IsSetAttrib()
        && NStr::Find(GetOrg().GetOrgname().GetAttrib(), kDisableStrainForwardAttrib) != string::npos) {
        val = true;
    }
    return val;
}


// Returns false for subsource subtypes that CSubSource::IsDiscouraged()
// rejects and for chromosome, map, plasmid-name and other; true for every
// remaining subtype.  Used by x_ClearCoordinatedBioSampleSubSources() to
// pick the subsources to erase.
bool s_MustCopy (int subtype)
{
    if (CSubSource::IsDiscouraged(subtype)) {
        return false;
    }
    switch (subtype) {
        case CSubSource::eSubtype_chromosome:
        case CSubSource::eSubtype_map:
        case CSubSource::eSubtype_plasmid_name:
        case CSubSource::eSubtype_other:
            return false;
        default:
            return true;
    }
}


// Strip values that IsStopWord() accepts from 'org_ref': the taxname is
// reset when it is a stop word, every org modifier whose subname is a stop
// word is erased, and the modifier list is reset when it ends up empty.
void CBioSource::x_RemoveStopWords(COrg_ref& org_ref)
{
    if (org_ref.IsSetTaxname() && IsStopWord(org_ref.GetTaxname())) {
        org_ref.ResetTaxname();
    }

    if ( !org_ref.IsSetOrgMod() ) {
        return;
    }

    COrgName::TMod& mods = org_ref.SetOrgname().SetMod();
    for (COrgName::TMod::iterator mod_it = mods.begin();  mod_it != mods.end(); ) {
        if (IsStopWord((*mod_it)->GetSubname())) {
            // erase() hands back the iterator following the removed element
            mod_it = mods.erase(mod_it);
        } else {
            ++mod_it;
        }
    }
    if (mods.empty()) {
        org_ref.SetOrgname().ResetMod();
    }
}


bool CBioSource::BiosampleDiffsOkForUpdate(const TFieldDiffList& diffs) const
{
    ITERATE(TFieldDiffList, it, diffs) {
        if (!NStr::IsBlank((*it)->GetSrcVal())) {
            return false;
        }
    }
    return true;
}


void CBioSource::UpdateWithBioSample(const CBioSource& biosample, bool force, bool is_local_copy)
{
    TFieldDiffList diffs = GetBiosampleDiffs(biosample, is_local_copy);
    if (!force && !BiosampleDiffsOkForUpdate(diffs)) {
        // throw exception
        NCBI_THROW(CException, eUnknown, "Conflicts found");
    }

    COrgName_Base::TMod mods;
    CBioSource_Base::TSubtype subtypes;

    ITERATE(TFieldDiffList, it, diffs) {
        string label = (*it)->GetFieldName();
    bool skipStopWord = true;
    if (NStr::EqualNocase(label, "collection-date") || NStr::EqualNocase(label, "country") || NStr::EqualNocase(label, "geo-loc-name")) {
        skipStopWord = false;
    }
        if (NStr::EqualNocase((*it)->GetFieldName(), "Organism Name")) {
            SetOrg().SetTaxname((*it)->GetSampleVal());
            if (GetOrg().IsSetOrgname() && GetOrg().GetOrgname().IsSetName()) {
                SetOrg().SetOrgname().ResetName();
            }
            RemoveOrgMod(COrgMod::eSubtype_old_name);
        } else if (NStr::EqualNocase((*it)->GetFieldName(), "Tax ID")) {
            try {
                SetOrg().SetTaxId(TAX_ID_FROM(int, atoi((*it)->GetSampleVal().c_str())));
            } catch (...) {
                NCBI_THROW(CException, eUnknown, "Non-integer Tax ID value");
            }
        } else {
            string sample_val = (*it)->GetSampleVal();
            if (skipStopWord && IsStopWord(sample_val)) {
                sample_val = "";
            }
            try {
                COrgMod::TSubtype subtype = COrgMod::GetSubtypeValue((*it)->GetFieldName());
                if (!NStr::IsBlank((*it)->GetSrcVal())) {
                    RemoveOrgMod(subtype, (*it)->GetSrcVal());
                }
                if (!NStr::IsBlank(sample_val)) {
                    CRef<COrgMod> mod(new COrgMod());
                    mod->SetSubtype(subtype);
                    mod->SetSubname(sample_val);
                    mods.push_back(mod);
                }
            } catch (...) {
                try {
                    CSubSource::TSubtype subtype = CSubSource::GetSubtypeValue((*it)->GetFieldName());
                    if (CSubSource::NeedsNoText(subtype)) {
                        // process diff that involve NeedsNoText subtypes
                        if (NStr::EqualNocase((*it)->GetSrcVal(), "true")) {
                            RemoveSubSource(subtype);
                        }
                        if (NStr::EqualNocase(sample_val, "true")) {
                            CRef<CSubSource> sub(new CSubSource());
                            sub->SetSubtype(subtype);
                            sub->SetName("");
                            subtypes.push_back(sub);
                        }
                    }
                    else {
                        // process all other subtypes
                        if (!NStr::IsBlank((*it)->GetSrcVal())) {
                            RemoveSubSource(subtype, (*it)->GetSrcVal());
                        }
                        if (!NStr::IsBlank(sample_val)) {
                            CRef<CSubSource> sub(new CSubSource());
                            sub->SetSubtype(subtype);
                            sub->SetName(sample_val);
                            subtypes.push_back(sub);
                        }
                    }
                } catch (...) {
                    NCBI_THROW(CException, eUnknown, "Unknown field name");
                }
            }
        }
    }

    if (!mods.empty()) {
        SetOrg().SetOrgname().SetMod().splice(SetOrg().SetOrgname().SetMod().end(), mods);
    }

    if (!subtypes.empty()) {
        SetSubtype().splice(SetSubtype().end(), subtypes);
    }

    AutoFix();
}


void CBioSource::x_ClearCoordinatedBioSampleSubSources()
{
    if (!IsSetSubtype()) {
        return;
    }
    CBioSource::TSubtype::iterator it = SetSubtype().begin();
    while (it != SetSubtype().end()) {
        if (s_MustCopy((*it)->GetSubtype())) {
            it = SetSubtype().erase(it);
        } else {
            ++it;
        }
    }
}


// Field labels used when building name/value pairs:
//   kOrgModNote   - label for OrgMod entries of subtype 'other'
//   kSubSrcNote   - label for SubSource entries of subtype 'other'
//   kOrganismName - label for the organism taxname
//   kTaxId        - label for the taxonomy id
static const char* kOrgModNote   = "orgmod_note";
static const char* kSubSrcNote   = "subsrc_note";
static const char* kOrganismName = "Organism Name";
static const char* kTaxId        = "Tax ID";

// Build the list of (field name, value) pairs describing this source, in
// this order: organism name (if set), Tax ID (if positive), the org-modifier
// pairs, then the subsource pairs.
CBioSource::TNameValList CBioSource::GetNameValPairs() const
{
    TNameValList pairs;

    if (IsSetOrg()) {
        const COrg_ref& org = GetOrg();
        if (org.IsSetTaxname()) {
            pairs.push_back(TNameVal(kOrganismName, org.GetTaxname()));
        }
        const TTaxId taxid = org.GetTaxId();
        if (taxid > ZERO_TAX_ID) {
            try {
                const string text = NStr::NumericToString(taxid);
                pairs.push_back(TNameVal(kTaxId, text));
            } catch (...) {
                // conversion failure: the Tax ID pair is simply left out
            }
        }
    }

    const TNameValList orgmod_pairs = x_GetOrgModNameValPairs();
    pairs.insert(pairs.end(), orgmod_pairs.begin(), orgmod_pairs.end());

    const TNameValList subsource_pairs = x_GetSubtypeNameValPairs();
    pairs.insert(pairs.end(), subsource_pairs.begin(), subsource_pairs.end());

    return pairs;
}


// Three-way comparison of two name/value pairs.
//
// Names are compared first (case-sensitive).  For equal names, a value that
// IsStopWord() accepts sorts before one it rejects, and two such values are
// treated as equal.  Otherwise values are compared case-insensitively, with
// a case-sensitive comparison breaking ties.
//
// @return negative, zero or positive, as for NStr::Compare.
int s_iCompareNameVals (const CBioSource::TNameVal& f1, const CBioSource::TNameVal& f2)
{
    const int by_name = NStr::Compare (f1.first, f2.first);
    if (by_name != 0) {
        return by_name;
    }

    const bool first_is_stop  = CBioSource::IsStopWord(f1.second);
    const bool second_is_stop = CBioSource::IsStopWord(f2.second);
    if (first_is_stop || second_is_stop) {
        if (first_is_stop && second_is_stop) {
            return 0;
        }
        return first_is_stop ? -1 : 1;
    }

    const int by_value_nocase = NStr::CompareNocase (f1.second, f2.second);
    if (by_value_nocase != 0) {
        return by_value_nocase;
    }
    return NStr::Compare(f1.second, f2.second);
}


// "Less than" predicate built on s_iCompareNameVals(); passed to sort() when
// ordering name/value pair lists.
bool s_CompareNameVals (const CBioSource::TNameVal& f1, const CBioSource::TNameVal& f2)
{
    return s_iCompareNameVals (f1, f2) < 0;
}


// Collect one (label, subname) pair for every org modifier that has both a
// subtype and a subname.  Subtype 'other' is labelled kOrgModNote; all other
// subtypes use COrgMod::GetSubtypeName().  The result is sorted with
// s_CompareNameVals.
CBioSource::TNameValList CBioSource::x_GetOrgModNameValPairs() const
{
    TNameValList pairs;

    if (IsSetOrgMod()) {
        const COrgName::TMod& mods = GetOrg().GetOrgname().GetMod();
        ITERATE(COrgName::TMod, mod_it, mods) {
            const COrgMod& mod = **mod_it;
            if ( !mod.IsSetSubname()  ||  !mod.IsSetSubtype() ) {
                continue;
            }
            string label;
            if (mod.GetSubtype() == COrgMod::eSubtype_other) {
                label = kOrgModNote;
            } else {
                label = COrgMod::GetSubtypeName(mod.GetSubtype());
            }
            pairs.push_back(TNameVal(label, mod.GetSubname()));
        }
    }

    sort(pairs.begin(), pairs.end(), s_CompareNameVals);
    return pairs;
}


// Collect one (label, value) pair for every subsource that has both a
// subtype and a name.  Subtype 'other' is labelled kSubSrcNote; all other
// subtypes use CSubSource::GetSubtypeName().  For subtypes where
// CSubSource::NeedsNoText() holds, a blank name is reported as "true".
// The result is sorted with s_CompareNameVals.
CBioSource::TNameValList CBioSource::x_GetSubtypeNameValPairs() const
{
    TNameValList pairs;

    if (IsSetSubtype()) {
        ITERATE(CBioSource::TSubtype, sub_it, GetSubtype()) {
            const CSubSource& sub = **sub_it;
            if ( !sub.IsSetName()  ||  !sub.IsSetSubtype() ) {
                continue;
            }
            const CSubSource::TSubtype kind = sub.GetSubtype();

            string label;
            if (kind == CSubSource::eSubtype_other) {
                label = kSubSrcNote;
            } else {
                label = CSubSource::GetSubtypeName(kind);
            }

            string value = sub.GetName();
            if (CSubSource::NeedsNoText(kind) && NStr::IsBlank(value)) {
                value = "true";
            }
            pairs.push_back(TNameVal(label, value));
        }
    }

    sort(pairs.begin(), pairs.end(), s_CompareNameVals);
    return pairs;
}


// Qualifier names looked up by s_MayIgnoreCase().
// Entries are listed in ascending case-insensitive order; keep them sorted
// when adding names (the static set built from this array is expected to
// require sorted input -- see util/static_map.hpp / static_set.hpp).
static const char* const s_IgnoreCaseQuals[] = {
    "cell-type",
    "collected-by",
    "dev-stage",
    "frequency",
    "group",
    "identified-by",
    "isolation-source",
    "map",
    "metagenome-source",
    "note",
    "phenotype",
    "sex",
    "subgroup",
    "tissue-type"
};

// Static set over the array above, using the case-insensitive C-string
// comparator PNocase_CStr.
typedef CStaticArraySet<const char*, PNocase_CStr> TCIgnoreCaseQualsSet;
static const TCIgnoreCaseQualsSet s_IgnoreCaseQualsSet(s_IgnoreCaseQuals, sizeof(s_IgnoreCaseQuals), __FILE__, __LINE__);

bool s_MayIgnoreCase(const string& value)
{
    return s_IgnoreCaseQualsSet.find(value.c_str()) != s_IgnoreCaseQualsSet.end();
}


// Qualifier labels treated as taxname elements (queried via
// s_IsTaxNameElement, which x_RemoveNameElementDiffs uses to drop diffs whose
// value already appears in the other side's taxname).
// NOTE(review): the entries are listed in alphabetical order; CStaticArraySet
// presumably requires sorted input, so keep new entries sorted -- confirm.
static const char* const s_TaxNameElementQuals[] = {
    "biovar",
    "chemovar",
    "forma",
    "forma-specialis",
    "genotype",
    "pathovar",
    "serotype",
    "serovar",
    "subspecies",
    "variety"
};

// Static lookup set over s_TaxNameElementQuals, keyed with the PNocase_CStr
// predicate (presumably a case-insensitive C-string comparison -- confirm).
typedef CStaticArraySet<const char*, PNocase_CStr> TCTaxNameElementQualsSet;
static const TCTaxNameElementQualsSet s_TaxNameElementQualsSet(s_TaxNameElementQuals, sizeof(s_TaxNameElementQuals), __FILE__, __LINE__);

bool s_IsTaxNameElement(const string& value)
{
    return s_TaxNameElementQualsSet.find(value.c_str()) != s_TaxNameElementQualsSet.end();
}


// How CBioSource::ShouldIgnoreConflict treats a qualifier found in
// sIgnoreConflictList.
typedef enum {
    eConflictIgnoreAll = 0,             // any difference is ignored
    eConflictIgnoreMissingInBioSource,  // ignored when the BioSource value is blank
    eConflictIgnoreMissingInBioSample   // ignored when the BioSample value is blank
                                        // (or a stop word, where stop words apply);
                                        // handled as eConflictIgnoreAll for local copies
} EConflictIgnoreType;


// One entry of sIgnoreConflictList: a qualifier label and the rule that
// CBioSource::ShouldIgnoreConflict applies to it.
typedef struct ignoreconflict {
    const char*         qual_name;    // qualifier label, matched case-insensitively
    EConflictIgnoreType ignore_type;  // which kind of difference is ignored
} IgnoreConflictData;


// Qualifiers for which BioSource/BioSample differences are (conditionally)
// ignored.  Scanned linearly by CBioSource::ShouldIgnoreConflict; the first
// entry whose name matches the label (case-insensitively) decides.
// No entry currently uses eConflictIgnoreMissingInBioSource.
static IgnoreConflictData sIgnoreConflictList[] = {
    { "chromosome", eConflictIgnoreMissingInBioSample } ,
    { "endogenous-virus-name", eConflictIgnoreMissingInBioSample } ,
    { "germline", eConflictIgnoreMissingInBioSample } ,
    { "insertion-seq-name", eConflictIgnoreMissingInBioSample } ,
    { "linkage-group", eConflictIgnoreMissingInBioSample } ,
    { "map", eConflictIgnoreMissingInBioSample } ,
    { "plasmid-name", eConflictIgnoreMissingInBioSample } ,
    { "pop-variant", eConflictIgnoreMissingInBioSample } ,
    { "rearranged", eConflictIgnoreMissingInBioSample } ,
    { "segment", eConflictIgnoreMissingInBioSample } ,
    { "transgenic", eConflictIgnoreMissingInBioSample } ,
    { "transposon-name", eConflictIgnoreMissingInBioSample } ,
    { "whole-replicon", eConflictIgnoreMissingInBioSample } ,
    { "acronym", eConflictIgnoreAll },
    { "common", eConflictIgnoreAll } ,
    { "dosage", eConflictIgnoreAll } ,
    { "gb-acronym", eConflictIgnoreAll } ,
    { "gb-anamorph", eConflictIgnoreAll } ,
    { "gb-synonym", eConflictIgnoreAll } ,
    { "lineage", eConflictIgnoreAll } ,
    { "old-lineage", eConflictIgnoreAll } ,
    { "old-name", eConflictIgnoreAll } ,
    { "synonym", eConflictIgnoreAll } ,
    { "type-material", eConflictIgnoreAll },
    { "StructuredCommentPrefix", eConflictIgnoreAll} ,
    { "StructuredCommentSuffix", eConflictIgnoreAll}
};


// Compares two coordinate values after rounding both to two decimal places.
//
// Returns false when either value exceeds 180.0; otherwise returns true
// exactly when the "%0.2f" renderings of the two values are identical.
//
// Only the upper bound is range-checked, so a large-magnitude negative value
// can reach the formatting step.  "%0.2f" of such a double may need more than
// 300 characters, hence the generously sized buffers and the bounded
// snprintf (the previous 20-byte sprintf buffers could be overrun).
bool s_SameExceptPrecision (double val1, double val2)
{
    if (val1 > 180.0 || val2 > 180.0) {
        return false;
    }
    // -DBL_MAX rendered with "%0.2f" takes 1 + 309 + 3 characters plus NUL
    char buf1[400];
    char buf2[400];
    snprintf(buf1, sizeof(buf1), "%0.2f", val1);
    snprintf(buf2, sizeof(buf2), "%0.2f", val2);
    return strcmp(buf1, buf2) == 0;
}


// Decides whether a difference between a BioSource value and a BioSample
// value for the qualifier `label` should be ignored rather than reported.
//
//   label          qualifier name (compared case-insensitively where it is
//                  tested against fixed names)
//   src_val        value from the BioSource (taken by value; normalized
//                  locally for the "country" comparison)
//   sample_val     value from the BioSample (taken by value, same reason)
//   is_local_copy  when true, eConflictIgnoreMissingInBioSample entries of
//                  sIgnoreConflictList are handled as eConflictIgnoreAll
//
// Returns true when the pair should not be reported as a conflict.
// The checks run in order and the early ones return immediately:
//   1. blank source value vs. stop-word sample value (not for
//      collection-date / country / geo-loc-name);
//   2. equal values (case-insensitively for labels in s_IgnoreCaseQuals);
//   3. source value equal to the auto-fixed sample value;
//   4. the sIgnoreConflictList rule for the label, if any;
//   5. label-specific normalization for collection-date, country, altitude.
bool CBioSource::ShouldIgnoreConflict(const string& label, string src_val, string sample_val, bool is_local_copy)
{
    size_t i;
    bool rval = false;

    // stop words are not treated as "missing" for these three qualifiers
    bool skipStopWord = true;
    if (NStr::EqualNocase(label, "collection-date") || NStr::EqualNocase(label, "country") || NStr::EqualNocase(label, "geo-loc-name")) {
        skipStopWord = false;
    }
    // ignore if BioSource value is blank and BioSample value is a stop word
    if (NStr::IsBlank(src_val) && skipStopWord && CBioSource::IsStopWord(sample_val)) {
        return true;
    }

    // ignore if the values are equal; case is disregarded only for
    // qualifiers listed in s_IgnoreCaseQuals
    if (s_MayIgnoreCase(label)) {
        if (NStr::EqualNocase(src_val, sample_val)) {
            return true;
        }
    } else {
        if (NStr::EqualCase(src_val, sample_val)) {
            return true;
        }
    }

    // ignore if the BioSource value equals the auto-fixed BioSample value.
    // The label is first tried as a SubSource subtype; if anything in that
    // attempt throws, it is retried as an OrgMod subtype.  A failure of the
    // second attempt is deliberately swallowed (best effort).
    if (!NStr::IsBlank(src_val) && !NStr::IsBlank(sample_val)) {
        try {
            CSubSource::TSubtype subtype = CSubSource::GetSubtypeValue(label);
            string test_val = CSubSource::AutoFix(subtype, sample_val);
            if (!NStr::IsBlank(test_val)) {
                if (NStr::Equal(src_val, test_val)) {
                    return true;
                }
            }
        } catch (...) {
            try {
                COrgMod::TSubtype subtype = COrgMod::GetSubtypeValue(label);
                string test_val = COrgMod::AutoFix(subtype, sample_val);
                if (!NStr::IsBlank(test_val)) {
                    if (NStr::Equal(src_val, test_val)) {
                        return true;
                    }
                }
            } catch (...) {
            }
        }
    }

    // apply the rule of the first sIgnoreConflictList entry matching the label
    for (i = 0; i < ArraySize(sIgnoreConflictList); i++) {
        if (NStr::EqualNocase(label, sIgnoreConflictList[i].qual_name)) {
            EConflictIgnoreType ignore_type = sIgnoreConflictList[i].ignore_type;
            // for a local copy, "missing in BioSample" is widened to "ignore all"
            if (is_local_copy && ignore_type == eConflictIgnoreMissingInBioSample) {
                ignore_type = eConflictIgnoreAll;
            }
            switch (ignore_type) {
                case eConflictIgnoreAll:
                    rval = true;
                    break;
                case eConflictIgnoreMissingInBioSource:
                    if (NStr::IsBlank(src_val)) {
                      rval = true;
                    }
                    break;
                case eConflictIgnoreMissingInBioSample:
                    if (NStr::IsBlank(sample_val) || (skipStopWord && CBioSource::IsStopWord(sample_val))) {
                      rval = true;
                    }
                    break;
            }
            break;
        }
    }
#if 0
    // special handling for lat-lon
    // commented out for SQD-4173
    if (!rval && NStr::EqualNocase(label, "lat-lon")) {
        bool src_format_correct, src_precision_correct,
             src_lat_in_range, src_lon_in_range;
        double src_lat_value, src_lon_value;
        CSubSource::IsCorrectLatLonFormat(src_val, src_format_correct, src_precision_correct,
                                          src_lat_in_range, src_lon_in_range,
                                          src_lat_value, src_lon_value);
        bool smpl_format_correct, smpl_precision_correct,
             smpl_lat_in_range, smpl_lon_in_range;
        double smpl_lat_value, smpl_lon_value;
        CSubSource::IsCorrectLatLonFormat(sample_val, smpl_format_correct, smpl_precision_correct,
                                          smpl_lat_in_range, smpl_lon_in_range,
                                          smpl_lat_value, smpl_lon_value);
        if (src_format_correct && smpl_format_correct
            && s_SameExceptPrecision(src_lat_value, smpl_lat_value)
            && s_SameExceptPrecision(src_lon_value, smpl_lon_value)) {
            rval = true;
        }
    }
#endif
    // special handling for collection-date: ignore when both values parse to
    // equal dates; parse failures are swallowed and leave rval unchanged
    if (!rval && NStr::EqualNocase(label, "collection-date")) {
        try {
            CRef<CDate> src_date = CSubSource::DateFromCollectionDate(src_val);
            CRef<CDate> smpl_date = CSubSource::DateFromCollectionDate(sample_val);
            if (src_date && smpl_date && src_date->Equals(*smpl_date)) {
                rval = true;
            }
        } catch (...) {
        }
    }
    // special handling for country: ignore differences that are only a space
    // after a colon (": " vs ":")
    if (!rval && NStr::EqualNocase(label, "country")) {
        NStr::ReplaceInPlace(src_val, ": ", ":");
        NStr::ReplaceInPlace(sample_val, ": ", ":");
        if (NStr::Equal(src_val, sample_val)) {
            rval = true;
        }
    }
    // special handling for altitude: ignore a trailing period that is present
    // only on the BioSource value
    if (!rval && NStr::EqualNocase(label, "altitude")) {
        if (NStr::EndsWith(src_val, ".") && !NStr::EndsWith(sample_val, ".")
            && NStr::EqualNocase(src_val.substr(0, src_val.length() - 1), sample_val)) {
            rval = true;
        }
    }
    return rval;
}


// Compares the values of one qualifier (`val_name`) found on the BioSource
// side (`list1`) with those found on the BioSample side (`list2`) and appends
// a CFieldDiff to `list` for everything that cannot be paired off.
//
// Pass 1 pairs each list1 value with the first still-unused list2 value for
// which CBioSource::ShouldIgnoreConflict() holds.  Pass 2 reports each
// remaining list1 value against the first still-unused list2 value (or
// against "" when none is left).  Pass 3 reports the list2 values that were
// never used, with "" as the source value.
void CompareValLists(TFieldDiffList& list, const string& val_name, bool is_local_copy, const vector<string>& list1, const vector<string>& list2)
{
    const size_t count1 = list1.size();
    const size_t count2 = list2.size();
    vector<bool> used1(count1, false);
    vector<bool> used2(count2, false);

    // pass 1: pair off values whose difference may be ignored
    for (size_t a = 0; a < count1; ++a) {
        for (size_t b = 0; b < count2; ++b) {
            if (used2[b]) {
                continue;
            }
            if (CBioSource::ShouldIgnoreConflict(val_name, list1[a], list2[b], is_local_copy)) {
                used2[b] = true;
                used1[a] = true;
                break;
            }
        }
    }

    // pass 2: report leftover source values, consuming leftover sample values
    for (size_t a = 0; a < count1; ++a) {
        if (used1[a]) {
            continue;
        }
        size_t b = 0;
        while (b < count2 && used2[b]) {
            ++b;
        }
        if (b < count2) {
            list.push_back(CRef<CFieldDiff>(new CFieldDiff(val_name, list1[a], list2[b])));
            used2[b] = true;
        } else {
            list.push_back(CRef<CFieldDiff>(new CFieldDiff(val_name, list1[a], "")));
        }
    }

    // pass 3: report sample values nothing was paired with
    for (size_t b = 0; b < count2; ++b) {
        if (used2[b]) {
            continue;
        }
        list.push_back(CRef<CFieldDiff>(new CFieldDiff(val_name, "", list2[b])));
    }
}


// Walks two name/value lists in parallel and appends the differences between
// them to `list`.
//
//   list           receives the CFieldDiff objects
//   list1          BioSource name/value pairs
//   list2          BioSample name/value pairs
//   is_local_copy  forwarded to CBioSource::ShouldIgnoreConflict()
//
// Names are compared with NStr::Compare, so the merge only lines up equal
// names if both lists are ordered consistently with that comparison (the
// caller in this file sorts both with s_CompareNameVals first).
// A name present on one side only is reported against "" unless
// ShouldIgnoreConflict() says otherwise.  For a name present on both sides
// all values of that name are collected and handed to CompareValLists().
void GetFieldDiffsFromNameValLists(TFieldDiffList& list,
                                   CBioSource::TNameValList& list1,
                                   CBioSource::TNameValList& list2,
                                   bool is_local_copy)
{
    CBioSource::TNameValList::iterator it1 = list1.begin();
    CBioSource::TNameValList::iterator it2 = list2.begin();

    // report a value that has no counterpart on the other side
    auto report_unpaired = [&list, is_local_copy](const string& name,
                                                  const string& src_val,
                                                  const string& sample_val) {
        if (!CBioSource::ShouldIgnoreConflict(name, src_val, sample_val, is_local_copy)) {
            CRef<CFieldDiff> diff(new CFieldDiff(name, src_val, sample_val));
            list.push_back(diff);
        }
    };

    while (it1 != list1.end() && it2 != list2.end()) {
        int cmp = NStr::Compare(it1->first, it2->first);
        if (cmp < 0) {
            // name only on the BioSource side
            report_unpaired(it1->first, it1->second, "");
            ++it1;
        } else if (cmp > 0) {
            // name only on the BioSample side
            report_unpaired(it2->first, "", it2->second);
            ++it2;
        } else {
            // same name on both sides: gather every value carrying it.
            // val_name refers to an element of list1, which is not modified
            // here, so it stays valid while it1 advances.
            const string& val_name = it1->first;
            vector<string> v1;
            vector<string> v2;
            v1.push_back(it1->second);
            v2.push_back(it2->second);
            ++it1;
            ++it2;
            while (it1 != list1.end() && NStr::Equal(it1->first, val_name)) {
                v1.push_back(it1->second);
                ++it1;
            }
            while (it2 != list2.end() && NStr::Equal(it2->first, val_name)) {
                v2.push_back(it2->second);
                ++it2;
            }

            CompareValLists(list, val_name, is_local_copy, v1, v2);
        }
    }
    // whatever is left on either side has no counterpart
    for (; it1 != list1.end(); ++it1) {
        report_unpaired(it1->first, it1->second, "");
    }
    for (; it2 != list2.end(); ++it2) {
        report_unpaired(it2->first, "", it2->second);
    }
}


void CBioSource::x_RemoveNameElementDiffs(const CBioSource& biosample, TFieldDiffList& diff_list) const
{
    string src_tax = "";
    if (IsSetOrg() && GetOrg().IsSetTaxname()) {
        src_tax = GetOrg().GetTaxname();
    }
    string sample_tax = "";
    if (biosample.IsSetOrg() && biosample.GetOrg().IsSetTaxname()) {
        sample_tax = biosample.GetOrg().GetTaxname();
    }
    TFieldDiffList::iterator it = diff_list.begin();
    while (it != diff_list.end()) {
        bool remove = false;
        if (s_IsTaxNameElement((*it)->GetFieldName())) {
            if (NStr::IsBlank((*it)->GetSampleVal())
                && NStr::Find(sample_tax, (*it)->GetSrcVal()) != string::npos) {
                // if value is missing from BioSample, but is present in BioSample taxname,
                // ignore
                remove = true;
            } else if (NStr::IsBlank((*it)->GetSrcVal())
                       && NStr::Find(src_tax, (*it)->GetSampleVal()) != string::npos) {
                // if value is missing from BioSource, but is present in BioSource taxname,
                // ignore
                remove = true;
            }
        }
        if (remove) {
            it = diff_list.erase(it);
        } else {
            it++;
        }
    }
}


// Erases from `diff_list` every diff whose field name equals `pair_name`,
// ignoring case.
void RemoveDiffByName(TFieldDiffList& diff_list, string pair_name)
{
    for (TFieldDiffList::iterator cur = diff_list.begin(); cur != diff_list.end(); ) {
        const bool name_matches = NStr::EqualNocase((*cur)->GetFieldName(), pair_name);
        if (!name_matches) {
            ++cur;
            continue;
        }
        cur = diff_list.erase(cur);
    }
}


bool CBioSource::x_ShouldIgnoreNoteForBiosample() const
{
    if (IsSetOrg() && GetOrg().IsSetOrgname() && GetOrg().GetOrgname().IsSetLineage()
        && NStr::Find(GetOrg().GetOrgname().GetLineage(), "unclassified sequences; metagenomes") != string::npos) {
        return true;
    } else {
        return false;
    }
}


// Returns the list of field differences between this BioSource and
// `biosample`.  Both sides are flattened with GetNameValPairs(), sorted with
// s_CompareNameVals and compared by GetFieldDiffsFromNameValLists();
// `is_local_copy` is forwarded to that comparison.  When both sources satisfy
// x_ShouldIgnoreNoteForBiosample(), diffs named "orgmod_note" and
// "subsrc_note" are removed from the result.
TFieldDiffList CBioSource::GetBiosampleDiffs(const CBioSource& biosample, bool is_local_copy) const
{
    TNameValList own_pairs = GetNameValPairs();
    TNameValList sample_pairs = biosample.GetNameValPairs();
    sort(own_pairs.begin(), own_pairs.end(), s_CompareNameVals);
    sort(sample_pairs.begin(), sample_pairs.end(), s_CompareNameVals);

    TFieldDiffList diffs;
    GetFieldDiffsFromNameValLists(diffs, own_pairs, sample_pairs, is_local_copy);
    // x_RemoveNameElementDiffs(biosample, diffs) is intentionally not
    // applied here (disabled for SQD-4222)

    const bool notes_irrelevant =
        x_ShouldIgnoreNoteForBiosample() && biosample.x_ShouldIgnoreNoteForBiosample();
    if (notes_irrelevant) {
        RemoveDiffByName(diffs, "orgmod_note");
        RemoveDiffByName(diffs, "subsrc_note");
    }

    return diffs;
}

// Terms returned by GetINSDCMissingValues().  The list currently holds the
// same 25 terms as s_StopWords in this file; keep the two in step when
// editing either one.
static const std::set<std::string> s_insdcNullTerms = {
    "-",
    "?",
    "missing: control sample",
    "missing: data agreement established pre-2023",
    "missing: endangered species",
    "missing: human-identifiable",
    "missing: lab stock",
    "missing: sample group",
    "missing: synthetic construct",
    "missing: third party data",
    "missing",
    "n/a",
    "na",
    "none",
    "not applicable",
    "not available",
    "not collected",
    "not determined",
    "not provided",
    "not recorded",
    "null",
    "restricted access",
    "unk",
    "unknown",
    "unspecified"
};


// Returns a copy of the s_insdcNullTerms set (lower-case spellings as listed
// there; std::set<std::string> lookups on the result are case-sensitive).
set<string> GetINSDCMissingValues()
{
    return s_insdcNullTerms;
}

// Values recognized by CBioSource::IsStopWord().
// NOTE(review): the entries are listed in alphabetical order; CStaticArraySet
// presumably requires sorted input, so keep new entries sorted -- confirm.
static const char* const s_StopWords[] = {
    "-",
    "?",
    "missing",
    "missing: control sample",
    "missing: data agreement established pre-2023",
    "missing: endangered species",
    "missing: human-identifiable",
    "missing: lab stock",
    "missing: sample group",
    "missing: synthetic construct",
    "missing: third party data",
    "n/a",
    "na",
    "none",
    "not applicable",
    "not available",
    "not collected",
    "not determined",
    "not provided",
    "not recorded",
    "null",
    "restricted access",
    "unk",
    "unknown",
    "unspecified"
};

// Static lookup set over s_StopWords, keyed with the PNocase_CStr predicate
// (presumably a case-insensitive C-string comparison -- confirm).
typedef CStaticArraySet<const char*, PNocase_CStr> TCStopWordStrSet;
static const TCStopWordStrSet s_StopWordsSet(s_StopWords, sizeof(s_StopWords), __FILE__, __LINE__);


bool CBioSource::IsStopWord(const string& value)
{
    if (s_StopWordsSet.find(value.c_str()) != s_StopWordsSet.end()) {
        return true;
    } else {
        return false;
    }
}


void CBioSource::AutoFix()
{
    if (IsSetSubtype()) {
        CBioSource::TSubtype::iterator it = SetSubtype().begin();
        while (it != SetSubtype().end()) {
            (*it)->AutoFix();
            if ((*it)->IsSetSubtype()
                && !CSubSource::NeedsNoText((*it)->GetSubtype())
                && (!(*it)->IsSetName() || NStr::IsBlank((*it)->GetName()))) {
                it = SetSubtype().erase(it);
            } else {
                it++;
            }
        }
        if (GetSubtype().empty()) {
            ResetSubtype();
        }
    }
    if (IsSetOrg() && GetOrg().IsSetOrgname() && GetOrg().GetOrgname().IsSetMod()) {
        COrgName::TMod::iterator it = SetOrg().SetOrgname().SetMod().begin();
        while (it != SetOrg().SetOrgname().SetMod().end()) {
            (*it)->AutoFix();
            if ((*it)->IsSetSubtype()
                && (!(*it)->IsSetSubname() || NStr::IsBlank((*it)->GetSubname()))) {
                it = SetOrg().SetOrgname().SetMod().erase(it);
            } else {
                it++;
            }
        }
        if (GetOrg().GetOrgname().GetMod().empty()) {
            SetOrg().SetOrgname().ResetMod();
        }
    }
}


void CBioSource::RemoveCultureNotes(bool is_species_level)
{
    if (IsSetSubtype()) {
        CBioSource::TSubtype::iterator it = SetSubtype().begin();
        while (it != SetSubtype().end()) {
            if ((*it)->IsSetSubtype() && (*it)->GetSubtype() == CSubSource::eSubtype_other) {
                CSubSource::RemoveCultureNotes((*it)->SetName(), is_species_level);
                if (NStr::IsBlank((*it)->GetName())) {
                    it = SetSubtype().erase(it);
                } else {
                    it++;
                }
            } else {
                it++;
            }
        }
        if (GetSubtype().empty()) {
            ResetSubtype();
        }
    }
}

// Extra words that CBioSource::RemoveLineageSourceNotes() accepts, in
// addition to the words of the lineage and taxname, when deciding whether a
// note consists only of lineage information.
// The pointers are const, like the other string tables in this file, so the
// table cannot be modified at run time.
static const char* const s_SpecialLineageWords[] = {
    "Class",
    "Classification",
    "Domain",
    "Family",
    "Genus",
    "Kingdom",
    "Lineage",
    "Note",
    "Order",
    "Organism",
    "Phylum",
    "Species",
    "Superfamily",
    "Tax class/lineage",
    "Taxonomic classification",
    "Taxonomic Classification is",
    "Taxonomy"
};

// Word list type shared by s_GetWordListFromText,
// s_DoesTextContainOnlyTheseWords and CBioSource::RemoveLineageSourceNotes.
typedef vector<CTempString> TWordList;

// Plain-function wrapper around ispunct() so it can be handed to
// std::replace_if as a predicate; the argument is widened through
// unsigned char before the call.
static inline bool s_IsPunct( char ch )
{
    const unsigned char as_uchar = (unsigned char)ch;
    return ispunct(as_uchar) != 0;
}

// Replaces every punctuation character of `str` with a space (in place) and
// then splits `str` on spaces with NStr::fSplit_Tokenize, passing `word_list`
// as the output container.  An empty `str` leaves both arguments untouched.
// The caller in this file invokes this repeatedly on one list to accumulate
// the words of several strings.
// NOTE(review): the list holds CTempString elements, which presumably refer
// to the characters of `str` rather than own copies -- `str` should outlive
// `word_list`; confirm.
static void s_GetWordListFromText(string& str, TWordList& word_list)
{
    if (!str.empty()) {
        for (string::iterator pos = str.begin(); pos != str.end(); ++pos) {
            if (s_IsPunct(*pos)) {
                *pos = ' ';
            }
        }
        NStr::Split(str, " ", word_list, NStr::fSplit_Tokenize);
    }
}


// Checks whether `text` is made up solely of words from `word_list`,
// separated by whitespace and/or punctuation.
//
// Words are matched case-insensitively, in list order, and a match counts
// only when it is followed by end of text, whitespace or punctuation.
// Returns false for empty text, for text containing no word at all, and as
// soon as a position is reached where no listed word matches.
static bool s_DoesTextContainOnlyTheseWords(const string& text, const TWordList& word_list)
{
    if (text.empty()) {
        return false;
    }

    const char* cursor = text.c_str();
    bool matched_any = false;

    for (;;) {
        // skip separators in front of the next word
        while (isspace((unsigned char)(*cursor)) || ispunct((unsigned char)(*cursor))) {
            ++cursor;
        }
        if (*cursor == 0) {
            break;
        }

        // try each known word at the current position
        bool word_found = false;
        for (TWordList::const_iterator word = word_list.begin(); word != word_list.end(); ++word) {
            const size_t word_len = word->size();
            if (NStr::strncasecmp(cursor, word->data(), word_len) != 0) {
                continue;
            }
            // the match must end at a word boundary
            const unsigned char following = *(cursor + word_len);
            if (following == '\0' || isspace(following) || ispunct(following)) {
                cursor += word_len;
                word_found = true;
                break;
            }
        }
        if (!word_found) {
            return false;
        }
        matched_any = true;
    }
    return matched_any;
}


bool CBioSource::RemoveLineageSourceNotes()
{
    if (!IsSetOrg()  || !IsSetLineage() || GetOrg().GetTaxId() == ZERO_TAX_ID) {
        return false;
    }
    bool any_removed = false;

    // gather all words that appear in lineage, taxname and in s_SpecialLineageWords
    TWordList word_list;

    string lineage(GetLineage());
    s_GetWordListFromText(lineage, word_list);

    string taxname(GetTaxname());
    s_GetWordListFromText(taxname, word_list);

    for (unsigned int i = 0; i < ArraySize(s_SpecialLineageWords); ++i) {
        word_list.push_back(s_SpecialLineageWords[i]);
    }

    if (IsSetSubtype()) {
        CBioSource::TSubtype::iterator it = SetSubtype().begin();
        while (it != SetSubtype().end()) {
            CRef<CSubSource> subsrc = *it;
            bool removed = false;
            if (subsrc->IsSetSubtype() && subsrc->GetSubtype() == CSubSource::eSubtype_other) {
                if (subsrc->IsSetName()) {
                    if (s_DoesTextContainOnlyTheseWords(subsrc->GetName(), word_list)) {
                        // remove this subsource note
                        it = SetSubtype().erase(it);
                        removed = true;
                        any_removed = true;
                    }
                }
            }
            if (!removed) {
                ++it;
            }
        }
        if (GetSubtype().empty()) {
            ResetSubtype();
        }
    }

    if (IsSetOrgname() && GetOrg().GetOrgname().IsSetMod()) {
        COrgName::TMod::iterator iter = SetOrg().SetOrgname().SetMod().begin();
        while (iter != SetOrg().SetOrgname().SetMod().end()) {
            CRef<COrgMod> orgmod = *iter;
            bool removed = false;
            if (orgmod->IsSetSubtype() && orgmod->GetSubtype() == COrgMod::eSubtype_other) {
                if (orgmod->IsSetSubname()) {
                    if (s_DoesTextContainOnlyTheseWords(orgmod->GetSubname(), word_list)) {
                        // remove this orgmod note
                        iter = SetOrg().SetOrgname().SetMod().erase(iter);
                        removed = true;
                        any_removed = true;
                    }
                }
            }
            if (!removed) {
                ++iter;
            }
        }
        if (GetOrg().GetOrgname().GetMod().empty()) {
            SetOrg().SetOrgname().ResetMod();
        }
    }

    return any_removed;
}


bool CBioSource::RemoveSubSource(int subtype)
{
    bool rval = false;

    if (IsSetSubtype()) {
        CBioSource::TSubtype::iterator it = SetSubtype().begin();
        while (it != SetSubtype().end()) {
            if ((*it)->IsSetSubtype() && (*it)->GetSubtype() == subtype) {
                it = SetSubtype().erase(it);
                rval = true;
            } else {
                it++;
            }
        }
        if (GetSubtype().empty()) {
            ResetSubtype();
        }
    }
    return rval;
}


bool CBioSource::RemoveSubSource(int subtype, const string& val)
{
    bool rval = false;

    if (IsSetSubtype()) {
        CBioSource::TSubtype::iterator it = SetSubtype().begin();
        while (it != SetSubtype().end()) {
            if ((*it)->IsSetSubtype() && (*it)->GetSubtype() == subtype &&
                (*it)->IsSetName() && NStr::Equal((*it)->GetName(), val)) {
                it = SetSubtype().erase(it);
                rval = true;
            } else {
                it++;
            }
        }
        if (GetSubtype().empty()) {
            ResetSubtype();
        }
    }
    return rval;
}


bool CBioSource::RemoveOrgMod(int subtype)
{
    bool rval = false;

    if (IsSetOrg() && GetOrg().IsSetOrgname() && GetOrg().GetOrgname().IsSetMod()) {
        COrgName::TMod::iterator it = SetOrg().SetOrgname().SetMod().begin();
        while (it != SetOrg().SetOrgname().SetMod().end()) {
            if ((*it)->IsSetSubtype() && (*it)->GetSubtype() == subtype) {
                it = SetOrg().SetOrgname().SetMod().erase(it);
                rval = true;
            } else {
                it++;
            }
        }
        if (GetOrg().GetOrgname().GetMod().empty()) {
            SetOrg().SetOrgname().ResetMod();
        }
    }
    return rval;
}


bool CBioSource::RemoveOrgMod(int subtype, const string& val)
{
    bool rval = false;

    if (IsSetOrg() && GetOrg().IsSetOrgname() && GetOrg().GetOrgname().IsSetMod()) {
        COrgName::TMod::iterator it = SetOrg().SetOrgname().SetMod().begin();
        while (it != SetOrg().SetOrgname().SetMod().end()) {
            if ((*it)->IsSetSubtype() && (*it)->GetSubtype() == subtype &&
                (*it)->IsSetSubname() && NStr::Equal((*it)->GetSubname(), val)) {
                it = SetOrg().SetOrgname().SetMod().erase(it);
                rval = true;
            } else {
                it++;
            }
        }
        if (GetOrg().GetOrgname().GetMod().empty()) {
            SetOrg().SetOrgname().ResetMod();
        }
    }
    return rval;
}


bool CBioSource::FixEnvironmentalSample()
{
    bool has_env_sample = false;
    bool has_metagenomic = false;
    bool any_change = false;

    if (IsSetSubtype()) {
        ITERATE(CBioSource::TSubtype, s, GetSubtype()) {
            if ((*s)->IsSetSubtype()) {
                if ((*s)->GetSubtype() == CSubSource::eSubtype_environmental_sample) {
                    has_env_sample = true;
                } else if ((*s)->GetSubtype() == CSubSource::eSubtype_metagenomic) {
                    has_metagenomic = true;
                }
                if (has_env_sample && has_metagenomic) {
                    break;
                }
            }
        }
    }

    if (!has_env_sample &&
        IsSetOrg() &&
        GetOrg().IsSetTaxname() &&
        NStr::StartsWith(GetOrg().GetTaxname(), "uncultured ")) {
        //If taxname starts with uncultured, set environmental - sample to true
        SetSubtype().push_back(CRef<CSubSource>(new CSubSource(CSubSource::eSubtype_environmental_sample, "")));
        has_env_sample = true;
        any_change = true;
    }

    if (has_metagenomic && !has_env_sample) {
        // If metagenomic, set environmental_sample
        SetSubtype().push_back(CRef<CSubSource>(new CSubSource(CSubSource::eSubtype_environmental_sample, "")));
        has_env_sample = true;
        any_change = true;
    }

    if (!has_env_sample &&
        IsSetOrg() && GetOrg().IsSetOrgname() &&
        GetOrg().GetOrgname().IsSetDiv() &&
        NStr::Equal(GetOrg().GetOrgname().GetDiv(), "ENV")) {
        // Add environmental_sample to BioSource if BioSource.org.orgname.div == "ENV"
        SetSubtype().push_back(CRef<CSubSource>(new CSubSource(CSubSource::eSubtype_environmental_sample, "")));
        has_env_sample = true;
        any_change = true;
    }

    if (IsSetOrg() && GetOrg().IsSetOrgname() &&
        GetOrg().GetOrgname().IsSetLineage() &&
        NStr::Find(GetOrg().GetOrgname().GetLineage(), "metagenomes") != string::npos) {
        // Add metagenomic(and environmental_sample) if BioSource.org.orgname.lineage contains "metagenomes"
        if (!has_env_sample) {
            SetSubtype().push_back(CRef<CSubSource>(new CSubSource(CSubSource::eSubtype_environmental_sample, "")));
            has_env_sample = true;
            any_change = true;
        }
        if (!has_metagenomic) {
            SetSubtype().push_back(CRef<CSubSource>(new CSubSource(CSubSource::eSubtype_metagenomic, "")));
            has_metagenomic = true;
            any_change = true;
        }
    }

    if (IsSetOrg() && GetOrg().IsSetOrgname() &&
        GetOrg().GetOrgname().IsSetMod()) {
        // Add metagenomic(and environmental_sample) if BioSource has /metagenome_source qualifier
        bool has_metagenome_source = false;
        ITERATE(COrgName::TMod, m, GetOrg().GetOrgname().GetMod()) {
            if ((*m)->IsSetSubtype() && (*m)->GetSubtype() == COrgMod::eSubtype_metagenome_source) {
                has_metagenome_source = true;
                break;
            }
        }
        if (has_metagenome_source) {
            if (!has_env_sample) {
                SetSubtype().push_back(CRef<CSubSource>(new CSubSource(CSubSource::eSubtype_environmental_sample, "")));
                has_env_sample = true;
                any_change = true;
            }
            if (!has_metagenomic) {
                SetSubtype().push_back(CRef<CSubSource>(new CSubSource(CSubSource::eSubtype_metagenomic, "")));
                has_metagenomic = true;
                any_change = true;
            }
        }
    }
    return any_change;
}


bool CBioSource::RemoveNullTerms()
{
    bool any_change = false;

    if (IsSetSubtype()) {
        CBioSource::TSubtype::iterator s = SetSubtype().begin();
        while (s != SetSubtype().end()) {
            if ((*s)->IsSetSubtype()) {
                if ((*s)->GetSubtype() == CSubSource::eSubtype_country || (*s)->GetSubtype() == CSubSource::eSubtype_collection_date) {
                    // skip "missing" null exemption value (RW-1944)
                    if ((*s)->IsSetName() && NStr::EqualNocase((*s)->GetName(), "missing")) {
                        ++s;
                        continue;
                    }
                }
            }
            if ((*s)->IsSetName() &&
                (NStr::EqualNocase((*s)->GetName(), "Missing")
                 || NStr::EqualNocase((*s)->GetName(), "N/A"))) {
                s = SetSubtype().erase(s);
                any_change = true;
            } else {
                ++s;
            }
        }
        if (GetSubtype().empty()) {
            ResetSubtype();
            any_change = true;
        }
    }
    if (IsSetOrg() && GetOrg().IsSetOrgname()
        && GetOrg().GetOrgname().IsSetMod()) {
        COrgName::TMod::iterator m = SetOrg().SetOrgname().SetMod().begin();
        while (m != SetOrg().SetOrgname().SetMod().end()) {
            if ((*m)->IsSetSubname() &&
                (NStr::EqualNocase((*m)->GetSubname(), "Missing")
                || NStr::EqualNocase((*m)->GetSubname(), "N/A"))) {
                m = SetOrg().SetOrgname().SetMod().erase(m);
                any_change = true;
            } else {
                ++m;
            }
        }
        if (GetOrg().GetOrgname().GetMod().empty()) {
            SetOrg().SetOrgname().ResetMod();
            any_change = true;
        }
    }

    return any_change;
}


// True when `lineage` starts with "Viruses; " (case-insensitive).
bool CBioSource::IsViral(const string& lineage)
{
    return NStr::StartsWith(lineage, "Viruses; ", NStr::eNocase);
}


// True when this source has a lineage and IsViral(lineage) holds for it;
// false when the org or lineage is not set.
bool CBioSource::IsViral() const
{
    if (!IsSetOrg() || !GetOrg().IsSetLineage()) {
        return false;
    }
    return IsViral(GetOrg().GetLineage());
}


bool CBioSource::AllowSexQualifier(const string& lineage)
{
    bool isViral = IsViral(lineage);
    bool isBacteria = false;
    bool isArchaea = false;
    bool isFungal = false;

    if (NStr::StartsWith(lineage, "Bacteria; ", NStr::eNocase)) {
        isBacteria = true;
    } else if (NStr::StartsWith(lineage, "Archaea; ", NStr::eNocase)) {
        isArchaea = true;
    } else if (NStr::StartsWith(lineage, "Eukaryota; Fungi; ", NStr::eNocase)) {
        isFungal = true;
    }

    if (isViral || isBacteria || isArchaea || isFungal) {
        return false;
    } else {
        return true;
    }
}


// Applies AllowSexQualifier(lineage) to this source's own lineage.  When no
// lineage is available the qualifier is allowed.
bool CBioSource::AllowSexQualifier() const
{
    const bool has_lineage =
        IsSetOrg() && GetOrg().IsSetOrgname() && GetOrg().GetOrgname().IsSetLineage();
    if (has_lineage) {
        return AllowSexQualifier(GetOrg().GetOrgname().GetLineage());
    }
    return true;
}


bool CBioSource::AllowMatingTypeQualifier(const string& lineage)
{
    bool isViral = IsViral(lineage);
    bool isAnimal = false;
    bool isPlant = false;

    if (NStr::StartsWith(lineage, "Eukaryota; Metazoa; ", NStr::eNocase)) {
        isAnimal = true;
    } else if (NStr::StartsWith(lineage, "Eukaryota; Viridiplantae; Streptophyta; Embryophyta; ", NStr::eNocase)
        || NStr::StartsWith(lineage, "Eukaryota; Rhodophyta; ", NStr::eNocase)
        || NStr::StartsWith(lineage, "Eukaryota; stramenopiles; Phaeophyceae; ", NStr::eNocase)) {
        isPlant = true;
    }

    if (isViral || isAnimal || isPlant) {
        return false;
    } else {
        return true;
    }
}


bool CBioSource::AllowMatingTypeQualifier() const
{
    if (!IsSetOrg() || !GetOrg().IsSetOrgname() || !GetOrg().GetOrgname().IsSetLineage()) {
        return true;
    } else {
        return AllowMatingTypeQualifier(GetOrg().GetOrgname().GetLineage());
    }
}


bool CBioSource::FixSexMatingTypeInconsistencies()
{
    bool any_change = false;
    if (!IsSetSubtype()) {
        return false;
    }
    TSubtype::iterator it = SetSubtype().begin();
    while (it != SetSubtype().end()) {
        bool remove = false;
        if ((*it)->IsSetSubtype()) {
            if ((*it)->GetSubtype() == CSubSource::eSubtype_sex && !AllowSexQualifier()) {
                remove = true;
            } else if ((*it)->GetSubtype() == CSubSource::eSubtype_mating_type) {
                if ((*it)->IsSetName() && AllowSexQualifier()
                     && CSubSource::IsValidSexQualifierValue((*it)->GetName())) {
                    (*it)->SetSubtype(CSubSource::eSubtype_sex);
                    any_change = true;
                } else if (!AllowMatingTypeQualifier()) {
                    remove = true;
                }
            }
        }
        if (remove) {
            it = SetSubtype().erase(it);
            any_change = true;
        } else {
            ++it;
        }
    }

    if (GetSubtype().size() == 0) {
        ResetSubtype();
        any_change = true;
    }

    return any_change;
}


bool CBioSource::RemoveUnexpectedViralQualifiers()
{
    if (!IsViral()  || !IsSetOrg() || !GetOrg().IsSetOrgname() ||
        !GetOrg().GetOrgname().IsSetMod()) {
        return false;
    }

    bool any_change = false;
    COrgName::TMod::iterator m = SetOrg().SetOrgname().SetMod().begin();
    while (m != SetOrg().SetOrgname().SetMod().end()){
        if ((*m)->IsUnexpectedViralOrgModQualifier()) {
            m = SetOrg().SetOrgname().SetMod().erase(m);
            any_change = true;
        } else {
            ++m;
        }
    }
    if (GetOrg().GetOrgname().GetMod().empty()) {
        SetOrg().SetOrgname().ResetMod();
        any_change = true;
    }
    return any_change;
}


// Sets the genome to eGenome_plasmid when the source carries a
// plasmid-name subsource but its genome is unset or eGenome_unknown.
// Returns true if the genome was changed, false otherwise.
bool CBioSource::FixGenomeForQualifiers()
{
    if (!HasSubtype(CSubSource::eSubtype_plasmid_name)) {
        return false;
    }
    if (IsSetGenome() && GetGenome() != eGenome_unknown) {
        // a specific genome is already recorded; leave it alone
        return false;
    }

    SetGenome(eGenome_plasmid);
    return true;
}


// MAKE_COMMON_INT(o1, o2, o3, Field)
//   Copies Field from o1 into o3 when Field is set on both o1 and o2 and the
//   two values compare equal; otherwise o3 is left untouched.
//   The body is wrapped in do { } while (0) so that an invocation followed
//   by ';' forms exactly one statement (safe inside unbraced if/else), and
//   every object argument is parenthesized so that expressions such as *p
//   can be passed without extra parentheses at the call site.
#define MAKE_COMMON_INT(o1,o2,o3,Field) \
    do { \
        if ((o1).IsSet##Field() && (o2).IsSet##Field() && \
            (o1).Get##Field() == (o2).Get##Field()) { \
            (o3).Set##Field((o1).Get##Field()); \
        } \
    } while (0)


// Builds a new CBioSource holding what this source and 'other' have in
// common, leaving the organism untouched:
//   - a copy of each subsource of this object that Equals() some subsource
//     of 'other' (in this object's order);
//   - genome and origin, when set on both and equal;
//   - PCR primers, when set on both and Equals().
// Always returns a non-null object (possibly with nothing set).
CRef<CBioSource> CBioSource::MakeCommonExceptOrg(const CBioSource& other) const
{
    CRef<CBioSource> result(new CBioSource());

    // shared subsources
    if (IsSetSubtype() && other.IsSetSubtype()) {
        const TSubtype& mine   = GetSubtype();
        const TSubtype& theirs = other.GetSubtype();
        ITERATE(TSubtype, lhs, mine) {
            ITERATE(TSubtype, rhs, theirs) {
                if ((*lhs)->Equals(**rhs)) {
                    CRef<CSubSource> copy(new CSubSource());
                    copy->Assign(**lhs);
                    result->SetSubtype().push_back(copy);
                    break;  // one match is enough; add each entry only once
                }
            }
        }
    }

    // shared scalar fields
    MAKE_COMMON_INT((*this), other, (*result), Genome);
    MAKE_COMMON_INT((*this), other, (*result), Origin);

    // shared PCR primers
    const bool same_primers = IsSetPcr_primers() && other.IsSetPcr_primers()
                              && GetPcr_primers().Equals(other.GetPcr_primers());
    if (same_primers) {
        result->SetPcr_primers().Assign(GetPcr_primers());
    }

    return result;
}


// Builds a new CBioSource holding what this source and 'other' have in
// common, including the organism.
// Returns a null reference when either source has no organism or when
// COrg_ref::MakeCommon() yields no common organism; otherwise returns the
// result of MakeCommonExceptOrg() with the common organism assigned.
CRef<CBioSource> CBioSource::MakeCommon( const CBioSource& other) const
{
    CRef<CBioSource> result;  // default-constructed CRef is null

    if (!IsSetOrg() || !other.IsSetOrg()) {
        return result;
    }

    CRef<COrg_ref> shared_org = GetOrg().MakeCommon(other.GetOrg());
    if (!shared_org) {
        return result;
    }

    result = MakeCommonExceptOrg(other);
    result->SetOrg().Assign(*shared_org);
    return result;
}


// Reports whether any subsource of this source has its subtype set to the
// given value. Returns false when no subtype list is set.
bool CBioSource::HasSubtype(CSubSource::TSubtype subtype) const
{
    if (!IsSetSubtype()) {
        return false;
    }

    const TSubtype& quals = GetSubtype();
    for (TSubtype::const_iterator cur = quals.begin(); cur != quals.end(); ++cur) {
        if (!(*cur)->IsSetSubtype()) {
            continue;
        }
        if ((*cur)->GetSubtype() == subtype) {
            return true;
        }
    }
    return false;
}

END_objects_SCOPE // namespace ncbi::objects::

END_NCBI_SCOPE

/* Original file checksum: lines: 64, chars: 1883, CRC32: e1194deb */
