You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

249 lines
5.8 KiB

3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
  1. from collections import defaultdict
  2. from typing import Dict, List, Type
  3. from pydantic import BaseModel
  4. from typing_extensions import Literal
  5. from . import examples
  6. from projects.models import (
  7. DOCUMENT_CLASSIFICATION,
  8. IMAGE_CLASSIFICATION,
  9. INTENT_DETECTION_AND_SLOT_FILLING,
  10. SEQ2SEQ,
  11. SEQUENCE_LABELING,
  12. SPEECH2TEXT,
  13. )
# Closed set of values allowed for the ``encoding`` field of the option
# models in this module.
# NOTE(review): apart from "Auto", the entries appear to be Python codec
# names; "Auto" presumably asks the reader to detect the encoding - confirm
# against the code that consumes this value.
encodings = Literal[
    "Auto",
    "ascii",
    "big5",
    "big5hkscs",
    "cp037",
    "cp273",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp65001",
    "euc_jp",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_kr",
    "gb2312",
    "gbk",
    "gb18030",
    "hz",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "latin_1",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "johab",
    "koi8_r",
    "koi8_t",
    "koi8_u",
    "kz1048",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "ptcp154",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "utf_32",
    "utf_32_be",
    "utf_32_le",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
    "utf_8_sig",
]
  115. class Format:
  116. name = ""
  117. accept_types = ""
  118. @classmethod
  119. def dict(cls):
  120. return {"name": cls.name, "accept_types": cls.accept_types}
  121. class CSV(Format):
  122. name = "CSV"
  123. accept_types = "text/csv"
  124. class FastText(Format):
  125. name = "fastText"
  126. accept_types = "text/plain"
  127. class JSON(Format):
  128. name = "JSON"
  129. accept_types = "application/json"
  130. class JSONL(Format):
  131. name = "JSONL"
  132. accept_types = "*"
  133. class Excel(Format):
  134. name = "Excel"
  135. accept_types = "application/vnd.ms-excel, application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
  136. class TextFile(Format):
  137. name = "TextFile"
  138. accept_types = "text/*"
  139. class TextLine(Format):
  140. name = "TextLine"
  141. accept_types = "text/*"
  142. class CoNLL(Format):
  143. name = "CoNLL"
  144. accept_types = "text/*"
  145. class ImageFile(Format):
  146. name = "ImageFile"
  147. accept_types = "image/png, image/jpeg, image/bmp, image/gif"
  148. class AudioFile(Format):
  149. name = "AudioFile"
  150. accept_types = "audio/ogg, audio/aac, audio/mpeg, audio/wav"
# Options for formats whose records carry named data and label fields.
# Documented with comments on purpose: pydantic copies a model's docstring
# into schema() as "description", which would change what
# Options.filter_by_task returns.
class OptionColumn(BaseModel):
    # File encoding; one of the ``encodings`` literals.
    encoding: encodings = "utf_8"
    # Name of the column/key holding the data (defaults to "text").
    column_data: str = "text"
    # Name of the column/key holding the label (defaults to "label").
    column_label: str = "label"
# OptionColumn plus a field delimiter.
# Documented with comments on purpose: pydantic copies a model's docstring
# into schema() as "description".
class OptionDelimiter(OptionColumn):
    # Re-declared with the same type and default as in OptionColumn.
    encoding: encodings = "utf_8"
    # Field separator: comma, tab, semicolon, pipe or a single space.
    delimiter: Literal[",", "\t", ";", "|", " "] = ","
# Options for formats that only need a file encoding.
# Documented with comments on purpose: pydantic copies a model's docstring
# into schema() as "description".
class OptionEncoding(BaseModel):
    # File encoding; one of the ``encodings`` literals.
    encoding: encodings = "utf_8"
# Options for the CoNLL format.
# Documented with comments on purpose: pydantic copies a model's docstring
# into schema() as "description".
class OptionCoNLL(BaseModel):
    # File encoding; one of the ``encodings`` literals.
    encoding: encodings = "utf_8"
    # Tagging scheme name; defaults to "IOB2".
    scheme: Literal["IOB2", "IOE2", "IOBES", "BILOU"] = "IOB2"
    # Either a single space (default) or the empty string.
    # NOTE(review): presumably the separator used when joining tokens -
    # confirm against the CoNLL reader.
    delimiter: Literal[" ", ""] = " "
# Option model with no fields, registered for formats that take no options.
# Documented with comments on purpose: pydantic copies a model's docstring
# into schema() as "description".
class OptionNone(BaseModel):
    pass
  166. class Options:
  167. options: Dict[str, List] = defaultdict(list)
  168. @classmethod
  169. def filter_by_task(cls, task_name: str):
  170. options = cls.options[task_name]
  171. return [{**format.dict(), **option.schema(), "example": example} for format, option, example in options]
  172. @classmethod
  173. def register(cls, task: str, format: Type[Format], option: Type[BaseModel], example: str):
  174. cls.options[task].append((format, option, example))
  175. # Text Classification
  176. Options.register(DOCUMENT_CLASSIFICATION, TextFile, OptionEncoding, examples.Generic_TextFile)
  177. Options.register(DOCUMENT_CLASSIFICATION, TextLine, OptionEncoding, examples.Generic_TextLine)
  178. Options.register(DOCUMENT_CLASSIFICATION, CSV, OptionDelimiter, examples.Category_CSV)
  179. Options.register(DOCUMENT_CLASSIFICATION, FastText, OptionEncoding, examples.Category_fastText)
  180. Options.register(DOCUMENT_CLASSIFICATION, JSON, OptionColumn, examples.Category_JSON)
  181. Options.register(DOCUMENT_CLASSIFICATION, JSONL, OptionColumn, examples.Category_JSONL)
  182. Options.register(DOCUMENT_CLASSIFICATION, Excel, OptionColumn, examples.Category_CSV)
  183. # Sequence Labeling
  184. Options.register(SEQUENCE_LABELING, TextFile, OptionEncoding, examples.Generic_TextFile)
  185. Options.register(SEQUENCE_LABELING, TextLine, OptionEncoding, examples.Generic_TextLine)
  186. Options.register(SEQUENCE_LABELING, JSONL, OptionColumn, examples.Offset_JSONL)
  187. Options.register(SEQUENCE_LABELING, CoNLL, OptionCoNLL, examples.Offset_CoNLL)
  188. # Sequence to sequence
  189. Options.register(SEQ2SEQ, TextFile, OptionEncoding, examples.Generic_TextFile)
  190. Options.register(SEQ2SEQ, TextLine, OptionEncoding, examples.Generic_TextLine)
  191. Options.register(SEQ2SEQ, CSV, OptionDelimiter, examples.Text_CSV)
  192. Options.register(SEQ2SEQ, JSON, OptionColumn, examples.Text_JSON)
  193. Options.register(SEQ2SEQ, JSONL, OptionColumn, examples.Text_JSONL)
  194. Options.register(SEQ2SEQ, Excel, OptionColumn, examples.Text_CSV)
  195. # Intent detection and slof filling
  196. Options.register(INTENT_DETECTION_AND_SLOT_FILLING, TextFile, OptionEncoding, examples.Generic_TextFile)
  197. Options.register(INTENT_DETECTION_AND_SLOT_FILLING, TextLine, OptionEncoding, examples.Generic_TextLine)
  198. Options.register(INTENT_DETECTION_AND_SLOT_FILLING, JSONL, OptionNone, examples.IDSF_JSONL)
  199. # Image classification
  200. Options.register(IMAGE_CLASSIFICATION, ImageFile, OptionNone, examples.Generic_ImageFile)
  201. # Speech to Text
  202. Options.register(SPEECH2TEXT, AudioFile, OptionNone, examples.Generic_AudioFile)