From cfb8ef9f65da630203c6ecd41b6ac13f8ac9c460 Mon Sep 17 00:00:00 2001
From: Adriaan de Groot <groot@kde.org>
Date: Sun, 14 Aug 2022 17:16:31 +0200
Subject: [PATCH] [locale] Use locale-similarity for searching

---
 src/modules/locale/LocaleConfiguration.cpp | 135 ++++++---------------
 src/modules/locale/LocaleNames.h           |   3 +
 2 files changed, 40 insertions(+), 98 deletions(-)

diff --git a/src/modules/locale/LocaleConfiguration.cpp b/src/modules/locale/LocaleConfiguration.cpp
index 17953f079a..c857ca80ff 100644
--- a/src/modules/locale/LocaleConfiguration.cpp
+++ b/src/modules/locale/LocaleConfiguration.cpp
@@ -9,11 +9,13 @@
  */
 
 #include "LocaleConfiguration.h"
+#include "LocaleNames.h"
 
 #include "utils/Logger.h"
 
 #include <QLocale>
 #include <QRegularExpression>
+#include <QVector>
 
 LocaleConfiguration::LocaleConfiguration()
     : explicit_lang( false )
@@ -47,90 +49,45 @@ LocaleConfiguration::fromLanguageAndLocation( const QString& languageLocale,
                                               const QString& countryCode )
 {
     cDebug() << "Mapping" << languageLocale << "in" << countryCode << "to locale.";
-    QString language = languageLocale.split( '_' ).first();
-    QString region;
-    if ( language.contains( '@' ) )
-    {
-        auto r = language.split( '@' );
-        language = r.first();
-        region = r[ 1 ];  // second()
-    }
-
-    // Either an exact match, or the whole language part matches
-    // (followed by .<encoding> or _<country>
-    QStringList linesForLanguage = availableLocales.filter( QRegularExpression( language + "[._]" ) );
-    cDebug() << Logger::SubEntry << "Matching" << linesForLanguage;
 
+    const QString default_lang = QStringLiteral( "en_US.UTF-8" );
     QString lang;
-    if ( linesForLanguage.isEmpty() || languageLocale.isEmpty() )
-    {
-        lang = "en_US.UTF-8";
-    }
-    else if ( linesForLanguage.length() == 1 )
-    {
-        lang = linesForLanguage.first();
-    }
 
-    // lang could still be empty if we found multiple locales that satisfy myLanguage
-    const QString combinedLanguageAndCountry = QString( "%1_%2" ).arg( language ).arg( countryCode );
-    if ( lang.isEmpty() && region.isEmpty() )
+    const LocaleNameParts self = LocaleNameParts::fromName( languageLocale );
+    if ( self.isValid() && !availableLocales.isEmpty() )
     {
-        auto l = linesForLanguage.filter(
-            QRegularExpression( combinedLanguageAndCountry + "[._]" ) );  // no regional variants
-        if ( l.length() == 1 )
-        {
-            lang = l.first();
-        }
-    }
+        QVector< LocaleNameParts > others;
+        others.resize( availableLocales.length() );  // Makes default structs
+        std::transform( availableLocales.begin(), availableLocales.end(), others.begin(), LocaleNameParts::fromName );
+        std::sort( others.begin(),
+                   others.end(),
+                   [ reference = self ]( const LocaleNameParts& lhs, const LocaleNameParts& rhs )
+                   { return reference.similarity( lhs ) < reference.similarity( rhs ); } );
 
-    // The following block was inspired by Ubiquity, scripts/localechooser-apply.
-    // No copyright statement found in file, assuming GPL v2 or later.
-    /*  # In the special cases of Portuguese and Chinese, selecting a
-        # different location may imply a different dialect of the language.
-        # In such cases, make LANG reflect the selected language (for
-        # messages, character types, and collation) and make the other
-        # locale categories reflect the selected location. */
-    if ( language == "pt" || language == "zh" )
-    {
-        cDebug() << Logger::SubEntry << "Special-case Portuguese and Chinese";
-        QString proposedLocale = QString( "%1_%2" ).arg( language ).arg( countryCode );
-        for ( const QString& line : linesForLanguage )
+        // The best match is at the end
+        LocaleNameParts best_match = others.last();
+        if ( !( self.similarity( best_match ) > LocaleNameParts::no_match ) )
         {
-            if ( line.contains( proposedLocale ) )
-            {
-                cDebug() << Logger::SubEntry << "Country-variant" << line << "chosen.";
-                lang = line;
-                break;
-            }
+            best_match = LocaleNameParts {};
         }
-    }
-    if ( lang.isEmpty() && !region.isEmpty() )
-    {
-        cDebug() << Logger::SubEntry << "Special-case region @" << region;
-        QString proposedRegion = QString( "@%1" ).arg( region );
-        for ( const QString& line : linesForLanguage )
+        // .. but it might match **better** with the chosen location country Code
+        if ( self.similarity( best_match ) < LocaleNameParts::complete_match )
         {
-            if ( line.startsWith( language ) && line.contains( proposedRegion ) )
+            auto self_other_country( self );
+            self_other_country.country = countryCode;
+            std::sort( others.begin(),
+                       others.end(),
+                       [ reference = self_other_country ]( const LocaleNameParts& lhs, const LocaleNameParts& rhs )
+                       { return reference.similarity( lhs ) < reference.similarity( rhs ); } );
+            if ( self_other_country.similarity( others.last() ) > self.similarity( best_match ) )
             {
-                cDebug() << Logger::SubEntry << "Region-variant" << line << "chosen.";
-                lang = line;
-                break;
+                best_match = others.last();
             }
         }
-    }
-
-
-    // If we found no good way to set a default lang, do a search with the whole
-    // language locale and pick the first result, if any.
-    if ( lang.isEmpty() )
-    {
-        for ( const QString& line : availableLocales )
+        if ( best_match.isValid() )
         {
-            if ( line.startsWith( languageLocale ) )
-            {
-                lang = line;
-                break;
-            }
+
+            lang = best_match.name();
         }
     }
 
@@ -138,7 +95,7 @@ LocaleConfiguration::fromLanguageAndLocation( const QString& languageLocale,
     // en_US.UTF-8 UTF-8. This completes all default language setting guesswork.
     if ( lang.isEmpty() )
     {
-        lang = "en_US.UTF-8";
+        lang = default_lang;
     }
 
 
@@ -188,34 +145,16 @@ LocaleConfiguration::fromLanguageAndLocation( const QString& languageLocale,
     // We make a proposed locale based on the UI language and the timezone's country. There is no
     // guarantee that this will be a valid, supported locale (often it won't).
     QString lc_formats;
-    const QString combined = QString( "%1_%2" ).arg( language ).arg( countryCode );
-    if ( lang.isEmpty() )
+    const QString combined = QString( "%1_%2" ).arg( self.language ).arg( countryCode );
+    if ( availableLocales.contains( lang ) )
     {
-        cDebug() << Logger::SubEntry << "Looking up formats for" << combinedLanguageAndCountry;
-        // We look up if it's a supported locale.
-        for ( const QString& line : availableLocales )
-        {
-            if ( line.startsWith( combinedLanguageAndCountry ) )
-            {
-                lang = line;
-                lc_formats = line;
-                break;
-            }
-        }
+        cDebug() << Logger::SubEntry << "Exact formats match for language tag" << lang;
+        lc_formats = lang;
     }
-    else
+    else if ( availableLocales.contains( combined ) )
     {
-        if ( availableLocales.contains( lang ) )
-        {
-            cDebug() << Logger::SubEntry << "Exact formats match for language tag" << lang;
-            lc_formats = lang;
-        }
-        else if ( availableLocales.contains( combinedLanguageAndCountry ) )
-        {
-            cDebug() << Logger::SubEntry << "Exact formats match for combined" << combinedLanguageAndCountry;
-            lang = combinedLanguageAndCountry;
-            lc_formats = combinedLanguageAndCountry;
-        }
+        cDebug() << Logger::SubEntry << "Exact formats match for combined" << combined;
+        lc_formats = combined;
     }
 
     if ( lc_formats.isEmpty() )
diff --git a/src/modules/locale/LocaleNames.h b/src/modules/locale/LocaleNames.h
index 247a8496c2..8498aa28a4 100644
--- a/src/modules/locale/LocaleNames.h
+++ b/src/modules/locale/LocaleNames.h
@@ -31,6 +31,9 @@ struct LocaleNameParts
 
     static LocaleNameParts fromName( const QString& name );
 
+    static inline constexpr const int no_match = 0;
+    static inline constexpr const int complete_match = 100;
+
     /** @brief Compute similarity-score with another locale-name.
      *
      * Similarity is driven by language and region, then country.
-- 
GitLab