// go-encode-id.cc -- Go identifier and packagepath encoding/decoding hooks

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go-system.h"

#include "gogo.h"
#include "go-location.h"
#include "go-linemap.h"
#include "go-encode-id.h"
#include "lex.h"

// Return whether the character C must be encoded, i.e. whether it is
// anything other than an ASCII letter or an ASCII digit.  Only those
// alphanumeric characters are copied into an encoded name unchanged.

static bool
char_needs_encoding(char c)
{
  const bool is_upper = c >= 'A' && c <= 'Z';
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_digit = c >= '0' && c <= '9';
  return !(is_upper || is_lower || is_digit);
}

// Return whether the identifier STR needs to be translated, which is
// the case when at least one of its bytes is something other than an
// ASCII letter or digit.  An empty string never needs translation.

bool
go_id_needs_encoding(const std::string& str)
{
  const std::string::size_type n = str.length();
  for (std::string::size_type i = 0; i < n; ++i)
    {
      if (char_needs_encoding(str[i]))
        return true;
    }
  return false;
}

// Lookup table giving, for each ASCII character that has one, the
// single character that follows '_' in its short underscore encoding.

class Special_char_code
{
 public:
  Special_char_code();

  // Return the short code for character C.  The result is 0 when C
  // has no short code, which includes every C above 127.
  char
  code_for(unsigned int c) const
  {
    if (c > 127)
      return 0;
    return this->codes_[c];
  }

 private:
  // Short code for each ASCII value; 0 marks "no short code".
  char codes_[128];
};

// Fill in the table: every entry starts out as 0, then the characters
// that have a short code are set from a list of pairs.

Special_char_code::Special_char_code()
{
  // Each row is { character, code }.
  static const char pairs[][2] =
    {
      { '_', '_' },
      { '.', '0' },
      { '/', '1' },
      { '*', '2' },
      { ',', '3' },
      { '{', '4' },
      { '}', '5' },
      { '[', '6' },
      { ']', '7' },
      { '(', '8' },
      { ')', '9' },
      { '"', 'a' },
      { ' ', 'b' },
      { ';', 'c' }
    };

  for (size_t i = 0; i < sizeof this->codes_; ++i)
    this->codes_[i] = 0;
  for (size_t i = 0; i < sizeof pairs / sizeof pairs[0]; ++i)
    this->codes_[static_cast<unsigned char>(pairs[i][0])] = pairs[i][1];
}

// The single table instance, consulted by go_encode_id.

static const Special_char_code special_char_code;

// Pull the next UTF-8 character out of P and store it in *PC.  Return
// the number of bytes read, which is always at least 1.
//
// P must point into a NUL-terminated buffer (the callers in this file
// pass std::string::c_str()).  A well-formed sequence of one to four
// bytes is decoded to its code point.  If the bytes at P do not start
// a well-formed sequence -- a stray continuation byte, a lead byte
// that announces more than four bytes, or a lead byte that is not
// followed by enough continuation bytes -- the single byte at P is
// stored in *PC as a value in the range 0x80..0xff and 1 is returned,
// so that the caller can treat it as an opaque byte.  Overlong forms
// and surrogates are not rejected.

static size_t
fetch_utf8_char(const char* p, unsigned int* pc)
{
  const unsigned char lead = static_cast<unsigned char>(*p);
  if ((lead & 0x80) == 0)
    {
      *pc = lead;
      return 1;
    }

  // Work out the sequence length and the payload bits carried by the
  // lead byte.
  size_t len;
  unsigned int rc;
  if ((lead & 0xe0) == 0xc0)
    {
      len = 2;
      rc = lead & 0x1f;
    }
  else if ((lead & 0xf0) == 0xe0)
    {
      len = 3;
      rc = lead & 0x0f;
    }
  else if ((lead & 0xf8) == 0xf0)
    {
      len = 4;
      rc = lead & 0x07;
    }
  else
    {
      // Continuation byte in lead position, or a lead byte for a
      // sequence longer than four bytes.
      *pc = lead;
      return 1;
    }

  for (size_t i = 1; i < len; i++)
    {
      const unsigned char u = static_cast<unsigned char>(p[i]);
      // Stopping at the first byte that is not a continuation byte
      // also stops at the terminating NUL, so a truncated sequence
      // never makes us read past the end of the buffer.
      if ((u & 0xc0) != 0x80)
        {
          *pc = lead;
          return 1;
        }
      rc = (rc << 6) | (u & 0x3f);
    }
  *pc = rc;
  return len;
}

// Encode an identifier using assembler-friendly characters.  The
// encoding is described in detail near the end of the long comment at
// the start of names.cc.  In outline, as implemented here:
//   - ASCII letters and digits are copied unchanged, except that a
//     digit in the first position is written as "_x" plus two hex
//     digits so that the result never starts with a digit;
//   - a character with a short code in special_char_code is written
//     as '_' followed by that code;
//   - any other single-byte character is written as "_x" plus two hex
//     digits;
//   - a multi-byte character is written as "_u" plus four hex digits
//     when it is below 0x10000, otherwise as "_U" plus eight.
// An identifier that Lex reports as invalid is returned unchanged.

std::string
go_encode_id(const std::string &id)
{
  if (Lex::is_invalid_identifier(id))
    {
      // An invalid identifier is only expected after an error.
      go_assert(saw_errors());
      return id;
    }

  std::string encoded;
  const char* cur = id.c_str();
  const char* const limit = cur + id.length();

  // Scratch space large enough for the longest escape, "_U" plus
  // eight hex digits.
  char buf[16];

  // Escape a leading digit.
  if (cur < limit && cur[0] >= '0' && cur[0] <= '9')
    {
      snprintf(buf, sizeof buf, "_x%02x", cur[0]);
      encoded.append(buf);
      ++cur;
    }

  for (size_t nbytes = 0; cur < limit; cur += nbytes)
    {
      unsigned int ch;
      nbytes = fetch_utf8_char(cur, &ch);

      // Multi-byte characters always use a _u or _U escape.
      if (nbytes != 1)
        {
          if (ch < 0x10000)
            snprintf(buf, sizeof buf, "_u%04x", ch);
          else
            snprintf(buf, sizeof buf, "_U%08x", ch);
          encoded.append(buf);
          continue;
        }

      // Letters and digits stand for themselves.
      if (!char_needs_encoding(ch))
        {
          encoded.push_back(ch);
          continue;
        }

      // Prefer the two-character short form when one exists.
      const char code = special_char_code.code_for(ch);
      if (code != 0)
        {
          encoded.push_back('_');
          encoded.push_back(code);
          continue;
        }

      snprintf(buf, sizeof buf, "_x%02x", ch);
      encoded.append(buf);
    }

  return encoded;
}

// Interpret the first NDIG characters of DIGITS as a hexadecimal
// number, most significant digit first, and return its value.  Each
// character is converted with Lex::hex_val; no check is made here to
// ensure that the characters really are hex digits.

static unsigned
hex_digits_to_unicode_codepoint(const char *digits, unsigned ndig)
{
  unsigned codepoint = 0;
  for (unsigned remaining = ndig; remaining != 0; --remaining)
    {
      codepoint <<= 4;
      codepoint |= Lex::hex_val(*digits);
      ++digits;
    }
  return codepoint;
}

// Return whether C is an ASCII hexadecimal digit, in either case.
// Helper for go_decode_id.

static bool
go_decode_id_is_hex(char c)
{
  return ((c >= '0' && c <= '9')
          || (c >= 'a' && c <= 'f')
          || (c >= 'A' && c <= 'F'));
}

// Decode/demangle a mangled string produced by go_encode_id().  Returns
// the empty string if the demangling process fails in some way: an
// underscore followed by an unknown code character, or an "_x", "_u"
// or "_U" escape that is not followed by 2, 4 or 8 hex digits
// respectively.  A non-underscore character, or an underscore that is
// the last character of the input, is copied through unchanged.  At
// the moment this routine is unused; there is an equivalent routine in
// the runtime used for demangling symbols appearing in stack traces.

std::string
go_decode_id(const std::string &encoded)
{
  // short_codes[i] after an underscore decodes to short_chars[i].
  // These must mirror the table built by Special_char_code.
  static const char short_codes[] = "_0123456789abc";
  static const char short_chars[] = "_./*,{}[]()\" ;";
  const size_t nshort = sizeof short_codes - 1;

  std::string ret;
  const char* p = encoded.c_str();
  const char* pend = p + encoded.length();
  const Location loc = Linemap::predeclared_location();

  while (p < pend)
    {
      if (*p != '_' || p + 1 == pend)
        {
          ret.push_back(*p);
          p++;
          continue;
        }

      const char code = p[1];

      // Number of hex digits that follow a numeric escape; 0 means
      // CODE is not a numeric escape.
      unsigned ndig = 0;
      if (code == 'x')
        ndig = 2;
      else if (code == 'u')
        ndig = 4;
      else if (code == 'U')
        ndig = 8;

      if (ndig == 0)
        {
          // Two-character short form.
          size_t i = 0;
          while (i < nshort && short_codes[i] != code)
            ++i;
          if (i == nshort)
            return "";
          ret.push_back(short_chars[i]);
          p += 2;
          continue;
        }

      // Numeric escape.  Measure what is left against PEND rather
      // than rescanning the tail of the string, and reject anything
      // that is not a hex digit before converting.
      // NOTE(review): how Lex::hex_val treats a non-hex character is
      // not visible in this file; validating here avoids relying on it.
      const char* digits = p + 2;
      if (static_cast<size_t>(pend - digits) < ndig)
        return "";
      for (unsigned i = 0; i < ndig; ++i)
        if (!go_decode_id_is_hex(digits[i]))
          return "";

      unsigned int rune = hex_digits_to_unicode_codepoint(digits, ndig);
      Lex::append_char(rune, true, &ret, loc);
      p += 2 + ndig;
    }

  return ret;
}

// Encode a struct field tag.  This is only used when we need to
// create a type descriptor for an anonymous struct type with field
// tags.  Underscore encoding will be applied to the returned string.
// The tag will appear between curly braces, so that is all we have to
// avoid: each '{', '}' and '\\' in TAG is preceded by a backslash and
// every other byte is copied unchanged.
//
// The tag is processed one byte at a time.  The three escaped
// characters are ASCII, and no byte of a multi-byte UTF-8 sequence is
// ASCII, so well-formed UTF-8 passes through intact.  A tag that is
// not well-formed UTF-8 is handled the same way, without reading past
// the end of the string and without leaving a brace unescaped.

std::string
go_mangle_struct_tag(const std::string& tag)
{
  std::string ret;
  ret.reserve(tag.length());
  for (std::string::const_iterator p = tag.begin();
       p != tag.end();
       ++p)
    {
      const char c = *p;
      if (c == '{' || c == '}' || c == '\\')
        ret.push_back('\\');
      ret.push_back(c);
    }
  return ret;
}