-
Notifications
You must be signed in to change notification settings - Fork 4.8k
HIVE-29514: Optimize UDF Unhex and improve its test coverage #6471
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -42,32 +42,59 @@ | |
| public class UDFUnhex extends UDF { | ||
|
|
||
| /** | ||
| * Convert every two hex digits in s into. | ||
| * | ||
| * Convert every two hex digits in s into a byte. | ||
| */ | ||
| public byte[] evaluate(Text s) { | ||
| if (s == null) { | ||
| return null; | ||
| } | ||
|
|
||
| // append a leading 0 if needed | ||
| String str; | ||
| if (s.getLength() % 2 == 1) { | ||
| str = "0" + s.toString(); | ||
| } else { | ||
| str = s.toString(); | ||
| int len = s.getLength(); | ||
| if (len == 0) { | ||
| return new byte[0]; | ||
| } | ||
|
|
||
| byte[] result = new byte[str.length() / 2]; | ||
| for (int i = 0; i < str.length(); i += 2) { | ||
| try { | ||
| result[i / 2] = ((byte) Integer.parseInt(str.substring(i, i + 2), 16)); | ||
| } catch (NumberFormatException e) { | ||
| // invalid character present, return null | ||
| byte[] textBytes = s.getBytes(); | ||
|
|
||
| // (len + 1) / 2 ensures right size for odd lengths | ||
| byte[] result = new byte[(len + 1) / 2]; | ||
|
|
||
| int i = 0; | ||
| int resIdx = 0; | ||
|
|
||
| // If the length is odd, the first character alone forms the first byte, so no "0" prefix has to be prepended | ||
| if (len % 2 != 0) { | ||
| int val = decodeHexChar(textBytes[i++]); | ||
| if (val == -1) { | ||
| return null; | ||
| } | ||
| result[resIdx++] = (byte) val; | ||
| } | ||
|
|
||
| while (i < len) { | ||
| int high = decodeHexChar(textBytes[i++]); | ||
| int low = decodeHexChar(textBytes[i++]); | ||
|
|
||
| if (high == -1 || low == -1) { | ||
| return null; | ||
|
Check warning on line 79 in ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUnhex.java
|
||
| } | ||
|
|
||
| result[resIdx++] = (byte) ((high << 4) | low); | ||
| } | ||
|
Comment on lines
+74
to
83
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. How about: so that we don't compute |
||
|
|
||
| return result; | ||
| } | ||
|
|
||
| private int decodeHexChar(byte b) { | ||
| if (b >= '0' && b <= '9') { | ||
| return b - '0'; | ||
| } | ||
| if (b >= 'a' && b <= 'f') { | ||
| return b - 'a' + 10; | ||
| } | ||
| if (b >= 'A' && b <= 'F') { | ||
| return b - 'A' + 10; | ||
| } | ||
| return -1; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,7 +21,9 @@ | |
|
|
||
|
|
||
| import org.apache.hadoop.io.Text; | ||
| import static org.junit.Assert.assertArrayEquals; | ||
| import static org.junit.Assert.assertEquals; | ||
| import static org.junit.Assert.assertNull; | ||
| import org.junit.Test; | ||
|
|
||
| /** | ||
|
|
@@ -40,8 +42,41 @@ public void testUnhexConversion(){ | |
| UDFUnhex udf = new UDFUnhex(); | ||
| byte[] output = udf.evaluate(hex); | ||
| assertEquals(expected.length,output.length); | ||
| for (int i = 0; i < expected.length; i++){ | ||
| assertEquals(expected[i], output[i]); | ||
| } | ||
| assertArrayEquals(expected, output); | ||
| } | ||
|
|
||
| @Test | ||
| public void testUnhexOddLength() { | ||
| UDFUnhex udf = new UDFUnhex(); | ||
|
|
||
| Text hex1 = new Text("A"); | ||
| byte[] expected1 = new byte[] {(byte) 0x0A}; | ||
| assertArrayEquals(expected1, udf.evaluate(hex1)); | ||
|
|
||
| Text hex2 = new Text("123"); | ||
| byte[] expected2 = new byte[] {(byte) 0x01, (byte) 0x23}; | ||
| assertArrayEquals(expected2, udf.evaluate(hex2)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testUnhexInvalidCharacters() { | ||
| UDFUnhex udf = new UDFUnhex(); | ||
|
|
||
| Text hex = new Text("7374G9"); | ||
| assertNull("Should return null for invalid hex characters", udf.evaluate(hex)); | ||
|
|
||
| Text hexOddInvalid = new Text("12G"); | ||
| assertNull("Should return null for invalid hex characters in odd length string", udf.evaluate(hexOddInvalid)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testUnhexNullEmptyCases() { | ||
| UDFUnhex udf = new UDFUnhex(); | ||
|
|
||
| assertNull(udf.evaluate(null)); | ||
|
|
||
| Text hexEmpty = new Text(""); | ||
| byte[] expectedEmpty = new byte[0]; | ||
| assertArrayEquals(expectedEmpty, udf.evaluate(hexEmpty)); | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be nice to have tests for
Also, maybe not in this file, but are there any tests for |
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add a test for invalid character in an odd-length input.
Example:
G.