You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

257 lines
6.0 KiB

3 years ago
3 years ago
3 years ago
  1. from collections import defaultdict
  2. from typing import Dict, List, Type
  3. from pydantic import BaseModel
  4. from typing_extensions import Literal
  5. from api.models import (DOCUMENT_CLASSIFICATION, IMAGE_CLASSIFICATION,
  6. INTENT_DETECTION_AND_SLOT_FILLING, SEQ2SEQ,
  7. SEQUENCE_LABELING, SPEECH2TEXT)
  8. from . import examples
  9. encodings = Literal[
  10. 'Auto',
  11. 'ascii',
  12. 'big5',
  13. 'big5hkscs',
  14. 'cp037',
  15. 'cp273',
  16. 'cp424',
  17. 'cp437',
  18. 'cp500',
  19. 'cp720',
  20. 'cp737',
  21. 'cp775',
  22. 'cp850',
  23. 'cp852',
  24. 'cp855',
  25. 'cp856',
  26. 'cp857',
  27. 'cp858',
  28. 'cp860',
  29. 'cp861',
  30. 'cp862',
  31. 'cp863',
  32. 'cp864',
  33. 'cp865',
  34. 'cp866',
  35. 'cp869',
  36. 'cp874',
  37. 'cp875',
  38. 'cp932',
  39. 'cp949',
  40. 'cp950',
  41. 'cp1006',
  42. 'cp1026',
  43. 'cp1125',
  44. 'cp1140',
  45. 'cp1250',
  46. 'cp1251',
  47. 'cp1252',
  48. 'cp1253',
  49. 'cp1254',
  50. 'cp1255',
  51. 'cp1256',
  52. 'cp1257',
  53. 'cp1258',
  54. 'cp65001',
  55. 'euc_jp',
  56. 'euc_jis_2004',
  57. 'euc_jisx0213',
  58. 'euc_kr',
  59. 'gb2312',
  60. 'gbk',
  61. 'gb18030',
  62. 'hz',
  63. 'iso2022_jp',
  64. 'iso2022_jp_1',
  65. 'iso2022_jp_2',
  66. 'iso2022_jp_2004',
  67. 'iso2022_jp_3',
  68. 'iso2022_jp_ext',
  69. 'iso2022_kr',
  70. 'latin_1',
  71. 'iso8859_2',
  72. 'iso8859_3',
  73. 'iso8859_4',
  74. 'iso8859_5',
  75. 'iso8859_6',
  76. 'iso8859_7',
  77. 'iso8859_8',
  78. 'iso8859_9',
  79. 'iso8859_10',
  80. 'iso8859_11',
  81. 'iso8859_13',
  82. 'iso8859_14',
  83. 'iso8859_15',
  84. 'iso8859_16',
  85. 'johab',
  86. 'koi8_r',
  87. 'koi8_t',
  88. 'koi8_u',
  89. 'kz1048',
  90. 'mac_cyrillic',
  91. 'mac_greek',
  92. 'mac_iceland',
  93. 'mac_latin2',
  94. 'mac_roman',
  95. 'mac_turkish',
  96. 'ptcp154',
  97. 'shift_jis',
  98. 'shift_jis_2004',
  99. 'shift_jisx0213',
  100. 'utf_32',
  101. 'utf_32_be',
  102. 'utf_32_le',
  103. 'utf_16',
  104. 'utf_16_be',
  105. 'utf_16_le',
  106. 'utf_7',
  107. 'utf_8',
  108. 'utf_8_sig'
  109. ]
  110. class Format:
  111. name = ''
  112. accept_types = ''
  113. @classmethod
  114. def dict(cls):
  115. return {
  116. 'name': cls.name,
  117. 'accept_types': cls.accept_types
  118. }
  119. class CSV(Format):
  120. name = 'CSV'
  121. accept_types = 'text/csv'
  122. class FastText(Format):
  123. name = 'fastText'
  124. accept_types = 'text/plain'
  125. class JSON(Format):
  126. name = 'JSON'
  127. accept_types = 'application/json'
  128. class JSONL(Format):
  129. name = 'JSONL'
  130. accept_types = '*'
  131. class Excel(Format):
  132. name = 'Excel'
  133. accept_types = 'application/vnd.ms-excel, application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
  134. class TextFile(Format):
  135. name = 'TextFile'
  136. accept_types = 'text/*'
  137. class TextLine(Format):
  138. name = 'TextLine'
  139. accept_types = 'text/*'
  140. class CoNLL(Format):
  141. name = 'CoNLL'
  142. accept_types = 'text/*'
  143. class ImageFile(Format):
  144. name = 'ImageFile'
  145. accept_types = 'image/png, image/jpeg, image/bmp, image/gif'
  146. class AudioFile(Format):
  147. name = 'AudioFile'
  148. accept_types = 'audio/ogg, audio/aac, audio/mpeg, audio/wav'
# Option model with a file encoding plus the names of the data and label
# columns.  Documented with comments rather than a docstring on purpose:
# pydantic copies a model's docstring into the generated schema, and that
# schema is merged into the output of Options.filter_by_task.
class OptionColumn(BaseModel):
    # Text encoding, restricted to the ``encodings`` literal set.
    encoding: encodings = 'utf_8'
    # Name of the column/key holding the data; defaults to 'text'.
    column_data: str = 'text'
    # Name of the column/key holding the label; defaults to 'label'.
    column_label: str = 'label'
# OptionColumn extended with a field delimiter.  Kept docstring-free so the
# generated pydantic schema does not gain a description entry.
class OptionDelimiter(OptionColumn):
    # Re-declared with the same type and default as in OptionColumn.
    encoding: encodings = 'utf_8'
    # One of comma, tab, semicolon, pipe or space; comma by default.
    delimiter: Literal[',', '\t', ';', '|', ' '] = ','
# Option model exposing only the file encoding.  Kept docstring-free so the
# generated pydantic schema does not gain a description entry.
class OptionEncoding(BaseModel):
    # Text encoding, restricted to the ``encodings`` literal set.
    encoding: encodings = 'utf_8'
# Option model registered with the CoNLL format: encoding, tagging scheme and
# delimiter.  Kept docstring-free so the generated pydantic schema does not
# gain a description entry.
class OptionCoNLL(BaseModel):
    # Text encoding, restricted to the ``encodings`` literal set.
    encoding: encodings = 'utf_8'
    # Name of the tagging scheme; 'IOB2' by default.
    scheme: Literal['IOB2', 'IOE2', 'IOBES', 'BILOU'] = 'IOB2'
    # Either a single space (default) or the empty string.
    # NOTE(review): the meaning of the empty-string choice is not visible here -- confirm.
    delimiter: Literal[' ', ''] = ' '
# Option model with no fields, registered for formats that take no options.
# Kept docstring-free so the generated pydantic schema does not gain a
# description entry.
class OptionNone(BaseModel):
    pass
  164. class Options:
  165. options: Dict[str, List] = defaultdict(list)
  166. @classmethod
  167. def filter_by_task(cls, task_name: str):
  168. options = cls.options[task_name]
  169. return [
  170. {
  171. **format.dict(),
  172. **option.schema(),
  173. 'example': example
  174. } for format, option, example in options
  175. ]
  176. @classmethod
  177. def register(cls,
  178. task: str,
  179. format: Type[Format],
  180. option: Type[BaseModel],
  181. example: str):
  182. cls.options[task].append((format, option, example))
  183. # Text Classification
  184. Options.register(DOCUMENT_CLASSIFICATION, TextFile, OptionEncoding, examples.Generic_TextFile)
  185. Options.register(DOCUMENT_CLASSIFICATION, TextLine, OptionEncoding, examples.Generic_TextLine)
  186. Options.register(DOCUMENT_CLASSIFICATION, CSV, OptionDelimiter, examples.Category_CSV)
  187. Options.register(DOCUMENT_CLASSIFICATION, FastText, OptionEncoding, examples.Category_fastText)
  188. Options.register(DOCUMENT_CLASSIFICATION, JSON, OptionColumn, examples.Category_JSON)
  189. Options.register(DOCUMENT_CLASSIFICATION, JSONL, OptionColumn, examples.Category_JSONL)
  190. Options.register(DOCUMENT_CLASSIFICATION, Excel, OptionColumn, examples.Category_CSV)
  191. # Sequence Labeling
  192. Options.register(SEQUENCE_LABELING, TextFile, OptionEncoding, examples.Generic_TextFile)
  193. Options.register(SEQUENCE_LABELING, TextLine, OptionEncoding, examples.Generic_TextLine)
  194. Options.register(SEQUENCE_LABELING, JSONL, OptionColumn, examples.Offset_JSONL)
  195. Options.register(SEQUENCE_LABELING, CoNLL, OptionCoNLL, examples.Offset_CoNLL)
  196. # Sequence to sequence
  197. Options.register(SEQ2SEQ, TextFile, OptionEncoding, examples.Generic_TextFile)
  198. Options.register(SEQ2SEQ, TextLine, OptionEncoding, examples.Generic_TextLine)
  199. Options.register(SEQ2SEQ, CSV, OptionDelimiter, examples.Text_CSV)
  200. Options.register(SEQ2SEQ, JSON, OptionColumn, examples.Text_JSON)
  201. Options.register(SEQ2SEQ, JSONL, OptionColumn, examples.Text_JSONL)
  202. Options.register(SEQ2SEQ, Excel, OptionColumn, examples.Text_CSV)
  203. # Intent detection and slof filling
  204. Options.register(INTENT_DETECTION_AND_SLOT_FILLING, TextFile, OptionEncoding, examples.Generic_TextFile)
  205. Options.register(INTENT_DETECTION_AND_SLOT_FILLING, TextLine, OptionEncoding, examples.Generic_TextLine)
  206. Options.register(INTENT_DETECTION_AND_SLOT_FILLING, JSONL, OptionNone, examples.IDSF_JSONL)
  207. # Image classification
  208. Options.register(IMAGE_CLASSIFICATION, ImageFile, OptionNone, examples.Generic_ImageFile)
  209. # Speech to Text
  210. Options.register(SPEECH2TEXT, AudioFile, OptionNone, examples.Generic_AudioFile)