inputext.cpp 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 2005-2016, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. */
  9. #include "unicode/utypes.h"
  10. #if !UCONFIG_NO_CONVERSION
  11. #include "inputext.h"
  12. #include "cmemory.h"
  13. #include "cstring.h"
  14. #include <string.h>
  15. U_NAMESPACE_BEGIN
  // Capacity, in bytes, of the fInputBytes working buffer allocated by the
  // constructor. MungeInput() never stores more than this many bytes.
  #define BUFFER_SIZE 8192

  // Allocates storage for `count` objects of `type` through uprv_malloc() and
  // yields the result as a `type *`. Callers in this file test the result
  // against nullptr to detect allocation failure.
  // The whole expansion is parenthesized so the macro behaves as one primary
  // expression: without the outer parentheses, a postfix operator written after
  // the macro (e.g. NEW_ARRAY(T, n)[0]) would bind to the uprv_malloc() call
  // instead of to the cast result.
  #define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))

  // Releases storage previously obtained with NEW_ARRAY.
  #define DELETE_ARRAY(array) uprv_free((void *) (array))
  19. InputText::InputText(UErrorCode &status)
  20. : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been
  21. // removed if appropriate.
  22. fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text.
  23. // Value is percent, not absolute.
  24. fDeclaredEncoding(0),
  25. fRawInput(0),
  26. fRawLength(0)
  27. {
  28. if (fInputBytes == nullptr || fByteStats == nullptr) {
  29. status = U_MEMORY_ALLOCATION_ERROR;
  30. }
  31. }
  32. InputText::~InputText()
  33. {
  34. DELETE_ARRAY(fDeclaredEncoding);
  35. DELETE_ARRAY(fByteStats);
  36. DELETE_ARRAY(fInputBytes);
  37. }
  38. void InputText::setText(const char *in, int32_t len)
  39. {
  40. fInputLen = 0;
  41. fC1Bytes = false;
  42. fRawInput = (const uint8_t *) in;
  43. fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;
  44. }
  45. void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
  46. {
  47. if(encoding) {
  48. if (len == -1) {
  49. len = (int32_t)uprv_strlen(encoding);
  50. }
  51. len += 1; // to make place for the \0 at the end.
  52. uprv_free(fDeclaredEncoding);
  53. fDeclaredEncoding = NEW_ARRAY(char, len);
  54. uprv_strncpy(fDeclaredEncoding, encoding, len);
  55. }
  56. }
  57. UBool InputText::isSet() const
  58. {
  59. return fRawInput != nullptr;
  60. }
  61. /**
  62. * MungeInput - after getting a set of raw input data to be analyzed, preprocess
  63. * it by removing what appears to be html markup.
  64. *
  65. * @internal
  66. */
  67. void InputText::MungeInput(UBool fStripTags) {
  68. int srci = 0;
  69. int dsti = 0;
  70. uint8_t b;
  71. bool inMarkup = false;
  72. int32_t openTags = 0;
  73. int32_t badTags = 0;
  74. //
  75. // html / xml markup stripping.
  76. // quick and dirty, not 100% accurate, but hopefully good enough, statistically.
  77. // discard everything within < brackets >
  78. // Count how many total '<' and illegal (nested) '<' occur, so we can make some
  79. // guess as to whether the input was actually marked up at all.
  80. // TODO: Think about how this interacts with EBCDIC charsets that are detected.
  81. if (fStripTags) {
  82. for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
  83. b = fRawInput[srci];
  84. if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
  85. if (inMarkup) {
  86. badTags += 1;
  87. }
  88. inMarkup = true;
  89. openTags += 1;
  90. }
  91. if (! inMarkup) {
  92. fInputBytes[dsti++] = b;
  93. }
  94. if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
  95. inMarkup = false;
  96. }
  97. }
  98. fInputLen = dsti;
  99. }
  100. //
  101. // If it looks like this input wasn't marked up, or if it looks like it's
  102. // essentially nothing but markup abandon the markup stripping.
  103. // Detection will have to work on the unstripped input.
  104. //
  105. if (openTags<5 || openTags/5 < badTags ||
  106. (fInputLen < 100 && fRawLength>600))
  107. {
  108. int32_t limit = fRawLength;
  109. if (limit > BUFFER_SIZE) {
  110. limit = BUFFER_SIZE;
  111. }
  112. for (srci=0; srci<limit; srci++) {
  113. fInputBytes[srci] = fRawInput[srci];
  114. }
  115. fInputLen = srci;
  116. }
  117. //
  118. // Tally up the byte occurrence statistics.
  119. // These are available for use by the various detectors.
  120. //
  121. uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
  122. for (srci = 0; srci < fInputLen; srci += 1) {
  123. fByteStats[fInputBytes[srci]] += 1;
  124. }
  125. for (int32_t i = 0x80; i <= 0x9F; i += 1) {
  126. if (fByteStats[i] != 0) {
  127. fC1Bytes = true;
  128. break;
  129. }
  130. }
  131. }
  132. U_NAMESPACE_END
  133. #endif