You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

251 lines
5.6 KiB

3 years ago
3 years ago
3 years ago
  1. from collections import defaultdict
  2. from typing import Dict, List, Type
  3. from pydantic import BaseModel
  4. from typing_extensions import Literal
  5. from ...models import (DOCUMENT_CLASSIFICATION, IMAGE_CLASSIFICATION, SEQ2SEQ,
  6. SEQUENCE_LABELING, SPEECH2TEXT)
  7. from . import examples
  8. encodings = Literal[
  9. 'Auto',
  10. 'ascii',
  11. 'big5',
  12. 'big5hkscs',
  13. 'cp037',
  14. 'cp273',
  15. 'cp424',
  16. 'cp437',
  17. 'cp500',
  18. 'cp720',
  19. 'cp737',
  20. 'cp775',
  21. 'cp850',
  22. 'cp852',
  23. 'cp855',
  24. 'cp856',
  25. 'cp857',
  26. 'cp858',
  27. 'cp860',
  28. 'cp861',
  29. 'cp862',
  30. 'cp863',
  31. 'cp864',
  32. 'cp865',
  33. 'cp866',
  34. 'cp869',
  35. 'cp874',
  36. 'cp875',
  37. 'cp932',
  38. 'cp949',
  39. 'cp950',
  40. 'cp1006',
  41. 'cp1026',
  42. 'cp1125',
  43. 'cp1140',
  44. 'cp1250',
  45. 'cp1251',
  46. 'cp1252',
  47. 'cp1253',
  48. 'cp1254',
  49. 'cp1255',
  50. 'cp1256',
  51. 'cp1257',
  52. 'cp1258',
  53. 'cp65001',
  54. 'euc_jp',
  55. 'euc_jis_2004',
  56. 'euc_jisx0213',
  57. 'euc_kr',
  58. 'gb2312',
  59. 'gbk',
  60. 'gb18030',
  61. 'hz',
  62. 'iso2022_jp',
  63. 'iso2022_jp_1',
  64. 'iso2022_jp_2',
  65. 'iso2022_jp_2004',
  66. 'iso2022_jp_3',
  67. 'iso2022_jp_ext',
  68. 'iso2022_kr',
  69. 'latin_1',
  70. 'iso8859_2',
  71. 'iso8859_3',
  72. 'iso8859_4',
  73. 'iso8859_5',
  74. 'iso8859_6',
  75. 'iso8859_7',
  76. 'iso8859_8',
  77. 'iso8859_9',
  78. 'iso8859_10',
  79. 'iso8859_11',
  80. 'iso8859_13',
  81. 'iso8859_14',
  82. 'iso8859_15',
  83. 'iso8859_16',
  84. 'johab',
  85. 'koi8_r',
  86. 'koi8_t',
  87. 'koi8_u',
  88. 'kz1048',
  89. 'mac_cyrillic',
  90. 'mac_greek',
  91. 'mac_iceland',
  92. 'mac_latin2',
  93. 'mac_roman',
  94. 'mac_turkish',
  95. 'ptcp154',
  96. 'shift_jis',
  97. 'shift_jis_2004',
  98. 'shift_jisx0213',
  99. 'utf_32',
  100. 'utf_32_be',
  101. 'utf_32_le',
  102. 'utf_16',
  103. 'utf_16_be',
  104. 'utf_16_le',
  105. 'utf_7',
  106. 'utf_8',
  107. 'utf_8_sig'
  108. ]
  109. class Format:
  110. name = ''
  111. accept_types = ''
  112. @classmethod
  113. def dict(cls):
  114. return {
  115. 'name': cls.name,
  116. 'accept_types': cls.accept_types
  117. }
  118. class CSV(Format):
  119. name = 'CSV'
  120. accept_types = 'text/csv'
  121. class FastText(Format):
  122. name = 'fastText'
  123. accept_types = 'text/plain'
  124. class JSON(Format):
  125. name = 'JSON'
  126. accept_types = 'application/json'
  127. class JSONL(Format):
  128. name = 'JSONL'
  129. accept_types = '*'
  130. class Excel(Format):
  131. name = 'Excel'
  132. accept_types = 'application/vnd.ms-excel, application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
  133. class TextFile(Format):
  134. name = 'TextFile'
  135. accept_types = 'text/*'
  136. class TextLine(Format):
  137. name = 'TextLine'
  138. accept_types = 'text/*'
  139. class CoNLL(Format):
  140. name = 'CoNLL'
  141. accept_types = 'text/*'
  142. class ImageFile(Format):
  143. name = 'ImageFile'
  144. accept_types = 'image/png, image/jpeg, image/bmp, image/gif'
  145. class AudioFile(Format):
  146. name = 'AudioFile'
  147. accept_types = 'audio/ogg, audio/aac, audio/mpeg, audio/wav'
  148. class OptionColumn(BaseModel):
  149. encoding: encodings = 'utf_8'
  150. column_data: str = 'text'
  151. column_label: str = 'label'
  152. class OptionDelimiter(OptionColumn):
  153. encoding: encodings = 'utf_8'
  154. delimiter: Literal[',', '\t', ';', '|', ' '] = ','
  155. class OptionEncoding(BaseModel):
  156. encoding: encodings = 'utf_8'
  157. class OptionCoNLL(BaseModel):
  158. encoding: encodings = 'utf_8'
  159. scheme: Literal['IOB2', 'IOE2', 'IOBES', 'BILOU'] = 'IOB2'
  160. delimiter: Literal[' ', ''] = ' '
  161. class OptionNone(BaseModel):
  162. pass
  163. class Options:
  164. options: Dict[str, List] = defaultdict(list)
  165. @classmethod
  166. def filter_by_task(cls, task_name: str):
  167. options = cls.options[task_name]
  168. return [
  169. {
  170. **format.dict(),
  171. **option.schema(),
  172. 'example': example
  173. } for format, option, example in options
  174. ]
  175. @classmethod
  176. def register(cls,
  177. task: str,
  178. format: Type[Format],
  179. option: Type[BaseModel],
  180. example: str):
  181. cls.options[task].append((format, option, example))
  182. # Text Classification
  183. Options.register(DOCUMENT_CLASSIFICATION, TextFile, OptionEncoding, examples.Generic_TextFile)
  184. Options.register(DOCUMENT_CLASSIFICATION, TextLine, OptionEncoding, examples.Generic_TextLine)
  185. Options.register(DOCUMENT_CLASSIFICATION, CSV, OptionDelimiter, examples.Category_CSV)
  186. Options.register(DOCUMENT_CLASSIFICATION, FastText, OptionEncoding, examples.Category_fastText)
  187. Options.register(DOCUMENT_CLASSIFICATION, JSON, OptionColumn, examples.Category_JSON)
  188. Options.register(DOCUMENT_CLASSIFICATION, JSONL, OptionColumn, examples.Category_JSONL)
  189. Options.register(DOCUMENT_CLASSIFICATION, Excel, OptionColumn, examples.Category_CSV)
  190. # Sequence Labeling
  191. Options.register(SEQUENCE_LABELING, TextFile, OptionEncoding, examples.Generic_TextFile)
  192. Options.register(SEQUENCE_LABELING, TextLine, OptionEncoding, examples.Generic_TextLine)
  193. Options.register(SEQUENCE_LABELING, JSONL, OptionColumn, examples.Offset_JSONL)
  194. Options.register(SEQUENCE_LABELING, CoNLL, OptionCoNLL, examples.Offset_CoNLL)
  195. # Sequence to sequence
  196. Options.register(SEQ2SEQ, TextFile, OptionEncoding, examples.Generic_TextFile)
  197. Options.register(SEQ2SEQ, TextLine, OptionEncoding, examples.Generic_TextLine)
  198. Options.register(SEQ2SEQ, CSV, OptionDelimiter, examples.Text_CSV)
  199. Options.register(SEQ2SEQ, JSON, OptionColumn, examples.Text_JSON)
  200. Options.register(SEQ2SEQ, JSONL, OptionColumn, examples.Text_JSONL)
  201. Options.register(SEQ2SEQ, Excel, OptionColumn, examples.Text_CSV)
  202. # Image classification
  203. Options.register(IMAGE_CLASSIFICATION, ImageFile, OptionNone, examples.Generic_ImageFile)
  204. # Speech to Text
  205. Options.register(SPEECH2TEXT, AudioFile, OptionNone, examples.Generic_AudioFile)