All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Torsten Bögershausen" <totte.enea@gmail.com>
To: Michael J Gruber <git@drmicha.warpmail.net>
Cc: "Ævar Arnfjörð Bjarmason" <avarab@gmail.com>,
	matthias.moeller@math.tu-dortmund.de, git@vger.kernel.org
Subject: Re: Git, Mac OS X and German special characters
Date: Thu, 20 May 2010 11:02:01 +0200	[thread overview]
Message-ID: <4BF4FA89.2040904@gmail.com> (raw)
In-Reply-To: <4BF4F7D7.60002@drmicha.warpmail.net>

Hej,
I have the same problem here.
Below is a patch that may solve the problem.
(Yes, the whitespace is broken. I'm still fighting with
git format-patch -s --cover-letter -M --stdout origin/master | git 
imap-send)
But this patch may be a starting point for improvements.
Comments welcome
BR
/Torsten



Improve interworking between Mac OS X and Linux when umlauts are used
When a git repository containing UTF-8 encoded umlaut characters
is cloned onto a Mac OS X machine, the Mac OS system will convert
all filenames returned by readdir() into decomposed (NFD) UTF-8.
As a result of this conversion, git will not find them on disk.
This helps by treating the NFC and NFD versions of filenames as
identical on Mac OS.






Signed-off-by: Torsten Bögershausen <tboegi@web.de>
---
name-hash.c |   40 ++++++++++++++++++++++++++++++++++++++++
utf8.c      |   55 ++++++++++++++++++++++++++++++++++++++++++++++++-------
utf8.h      |   11 +++++++++++
3 files changed, 99 insertions(+), 7 deletions(-)

diff --git a/name-hash.c b/name-hash.c
index 0031d78..e6494e8 100644
--- a/name-hash.c
+++ b/name-hash.c
@@ -7,6 +7,7 @@
  */
#define NO_THE_INDEX_COMPATIBILITY_MACROS
#include "cache.h"
+#include "utf8.h"

/*
  * This removes bit 5 if bit 6 is set.
@@ -100,6 +101,25 @@ static int same_name(const struct cache_entry *ce, 
const char *name, int namelen
     return icase && slow_same_name(name, namelen, ce->name, len);
}

+#ifdef __APPLE__
+struct cache_entry *index_name_exists2(struct index_state *istate, 
const char *name, int icase)
+{
+    int namelen = (int)strlen(name);
+    unsigned int hash = hash_name(name, namelen);
+    struct cache_entry *ce;
+
+    ce = lookup_hash(hash, &istate->name_hash);
+    while (ce) {
+        if (!(ce->ce_flags & CE_UNHASHED)) {
+            if (same_name(ce, name, namelen, icase))
+                return ce;
+        }
+        ce = ce->next;
+    }
+    return NULL;
+}
+#endif
+
struct cache_entry *index_name_exists(struct index_state *istate, const 
char *name, int namelen, int icase)
{
     unsigned int hash = hash_name(name, namelen);
@@ -115,5 +135,25 @@ struct cache_entry *index_name_exists(struct 
index_state *istate, const char *na
         }
         ce = ce->next;
     }
+#ifdef __APPLE__
+    {
+        char *name_nfc_nfd;
+        name_nfc_nfd = str_nfc2nfd(name);
+        if (name_nfc_nfd) {
+            ce = index_name_exists2(istate, name_nfc_nfd, icase);
+            free(name_nfc_nfd);
+            if (ce)
+                return ce;
+        }
+        name_nfc_nfd = str_nfd2nfc(name);
+        if (name_nfc_nfd) {
+            ce = index_name_exists2(istate, name_nfc_nfd, icase);
+            free(name_nfc_nfd);
+            if (ce)
+                return ce;
+        }
+    }
+#endif
+
     return NULL;
}
diff --git a/utf8.c b/utf8.c
index 84cfc72..8e794dc 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2,6 +2,11 @@
#include "strbuf.h"
#include "utf8.h"

+#ifdef __APPLE__
+static iconv_t my_iconv_nfd2nfc = (iconv_t) -1;
+static iconv_t my_iconv_nfc2nfd = (iconv_t) -1;
+#endif
+
/* This code is originally from http://www.cl.cam.ac.uk/~mgk25/ucs/ */

struct interval {
@@ -424,18 +429,13 @@ int is_encoding_utf8(const char *name)
#else
     typedef char * iconv_ibp;
#endif
-char *reencode_string(const char *in, const char *out_encoding, const 
char *in_encoding)
+
+char *reencode_string_iconv(const char *in, iconv_t conv)
{
-    iconv_t conv;
     size_t insz, outsz, outalloc;
     char *out, *outpos;
     iconv_ibp cp;

-    if (!in_encoding)
-        return NULL;
-    conv = iconv_open(out_encoding, in_encoding);
-    if (conv == (iconv_t) -1)
-        return NULL;
     insz = strlen(in);
     outsz = insz;
     outalloc = outsz + 1; /* for terminating NUL */
@@ -469,7 +469,48 @@ char *reencode_string(const char *in, const char 
*out_encoding, const char *in_e
             break;
         }
     }
+    return out;
+}
+
+char *reencode_string(const char *in, const char *out_encoding, const 
char *in_encoding)
+{
+    iconv_t conv;
+    char *out;
+
+    if (!in_encoding)
+        return NULL;
+    conv = iconv_open(out_encoding, in_encoding);
+    if (conv == (iconv_t) -1)
+        return NULL;
+    out = reencode_string_iconv(in, conv);
     iconv_close(conv);
     return out;
}
+
+#ifdef __APPLE__
+char*
+str_nfc2nfd(const char *in)
+{
+    if (my_iconv_nfc2nfd == (iconv_t) -1) {
+        my_iconv_nfc2nfd = iconv_open("utf-8-mac", "utf-8");
+        if (my_iconv_nfc2nfd == (iconv_t) -1) {
+            return NULL;
+        }
+    }
+    return reencode_string_iconv(in, my_iconv_nfc2nfd);
+}
+
+char*
+str_nfd2nfc(const char *in)
+{
+    if (my_iconv_nfd2nfc == (iconv_t) -1){
+        my_iconv_nfd2nfc = iconv_open("utf-8", "utf-8-mac");
+        if (my_iconv_nfd2nfc == (iconv_t) -1) {
+            return NULL;
+        }
+    }
+    return reencode_string_iconv(in, my_iconv_nfd2nfc);
+}
+#endif /* APPLE */
+
#endif
diff --git a/utf8.h b/utf8.h
index ebc4d2f..db29c8a 100644
--- a/utf8.h
+++ b/utf8.h
@@ -13,8 +13,19 @@ int strbuf_add_wrapped_text(struct strbuf *buf,

#ifndef NO_ICONV
char *reencode_string(const char *in, const char *out_encoding, const 
char *in_encoding);
+char *reencode_string_iconv(const char *in, iconv_t conv);
+#ifdef __APPLE__
+char *str_nfc2nfd(const char *in);
+char *str_nfd2nfc(const char *in);
+#else
+#define str_nfc2nfd(in) (NULL)
+#define str_nfd2nfc(in) (NULL)
+#endif
#else
#define reencode_string(a,b,c) NULL
+#define reencode_string2(a,b) NULL
+#define str_nfc2nfd(in) (NULL)
+#define str_nfd2nfc(in) (NULL)
#endif

#endif
-- 
1.7.1.dirty










On 20.05.10 10:50, Michael J Gruber wrote:
> Ævar Arnfjörð Bjarmason venit, vidit, dixit 20.05.2010 10:34:
>    
>> On Thu, May 20, 2010 at 07:26, Matthias Moeller
>> <matthias.moeller@math.tu-dortmund.de>  wrote:
>>      
>>> I have been searching the web for help and found lengthy discussions
>>> which state that this is a common problem of the HFS+ filesystem.
>>> What I did not find was a solution to this problem. Is there a solution
>>> to this problem?
>>>        
>> Is this problem particular to Git, or do you also get it if you
>> e.g. rsync from the Linux box to the Mac OS X box?
>>
>>      
>>> #       "U\314\210bersicht.xls"
>>>        
>> You probably have to configure your shell on OSX to render UTF-8
>> correctly. It's just showing the raw escaped byte sequence instead of
>> a character there.
>>
>> There isn't anything wrong with OSX in this case, filename encoding on
>> any POSIX system is only done by convention. You'll find that you have
>> similar problems on Linux if you encode filename in Big5 or
>> UTF-32.
>>
>> Linux will happily accept it, but your shell / other applications will
>> render it as unknown goo because they expect UTF-8.
>>      
> No, the problem with git status is not the display. Matthias' problem is
> that git status reports a tracked file as untracked. The reason is that
> on HFS+, you create a file with name A and get a file with name B, where
> A and B are different representations of the same name. There seems to
> be no way to reliably detect which one HFS+ uses.
>
> Michael
> --
> To unsubscribe from this list: send the line "unsubscribe git" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>    

  parent reply	other threads:[~2010-05-20  9:02 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-05-20  7:26 Git, Mac OS X and German special characters Matthias Moeller
2010-05-20  8:34 ` Ævar Arnfjörð Bjarmason
2010-05-20  8:50   ` Michael J Gruber
2010-05-20  8:57     ` demerphq
2010-05-20  9:02     ` Torsten Bögershausen [this message]
2010-05-20  9:15       ` Michael J Gruber
     [not found]         ` <4BF5294E.7060206@web.de>
2010-05-20 14:29           ` Michael J Gruber
2010-05-20 15:30         ` Jay Soffian
2010-05-20 15:50       ` Jay Soffian
2010-05-20 18:22         ` Jay Soffian
2010-05-20  9:16     ` Matthias Moeller
2010-05-20 10:38     ` Thomas Singer
2010-05-20  8:55   ` demerphq
  -- strict thread matches above, loose matches on Subject: below --
2011-10-01 12:44 Albert Zeyer
2011-10-01 13:39 ` Andreas Ericsson
     [not found]   ` <CAO1Q+jeLEp2ReNc9eOFoJxdGq6oRE3b+O=JvMNU0Kqx_eAX=7w@mail.gmail.com>
2011-10-01 14:24     ` Andreas Ericsson
2011-10-01 19:47       ` Andreas Krey
2011-10-01 22:02         ` Michael Witten
2011-10-01 23:14           ` Jakub Narebski
2011-10-01 23:26             ` Michael Witten
2011-10-01 23:48           ` Albert Zeyer
2011-10-03 19:48 ` Torsten Bögershausen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4BF4FA89.2040904@gmail.com \
    --to=totte.enea@gmail.com \
    --cc=avarab@gmail.com \
    --cc=git@drmicha.warpmail.net \
    --cc=git@vger.kernel.org \
    --cc=matthias.moeller@math.tu-dortmund.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.